1e43473b7SVivek Goyal /* 2e43473b7SVivek Goyal * Interface for controlling IO bandwidth on a request queue 3e43473b7SVivek Goyal * 4e43473b7SVivek Goyal * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 5e43473b7SVivek Goyal */ 6e43473b7SVivek Goyal 7e43473b7SVivek Goyal #include <linux/module.h> 8e43473b7SVivek Goyal #include <linux/slab.h> 9e43473b7SVivek Goyal #include <linux/blkdev.h> 10e43473b7SVivek Goyal #include <linux/bio.h> 11e43473b7SVivek Goyal #include <linux/blktrace_api.h> 12e43473b7SVivek Goyal #include "blk-cgroup.h" 13bc9fcbf9STejun Heo #include "blk.h" 14e43473b7SVivek Goyal 15e43473b7SVivek Goyal /* Max dispatch from a group in 1 round */ 16e43473b7SVivek Goyal static int throtl_grp_quantum = 8; 17e43473b7SVivek Goyal 18e43473b7SVivek Goyal /* Total max dispatch from all groups in one round */ 19e43473b7SVivek Goyal static int throtl_quantum = 32; 20e43473b7SVivek Goyal 21e43473b7SVivek Goyal /* Throttling is performed over 100ms slice and after that slice is renewed */ 22e43473b7SVivek Goyal static unsigned long throtl_slice = HZ/10; /* 100 ms */ 23e43473b7SVivek Goyal 243c798398STejun Heo static struct blkcg_policy blkcg_policy_throtl; 250381411eSTejun Heo 26450adcbeSVivek Goyal /* A workqueue to queue throttle related work */ 27450adcbeSVivek Goyal static struct workqueue_struct *kthrotld_workqueue; 28450adcbeSVivek Goyal 29*c5cc2070STejun Heo /* 30*c5cc2070STejun Heo * To implement hierarchical throttling, throtl_grps form a tree and bios 31*c5cc2070STejun Heo * are dispatched upwards level by level until they reach the top and get 32*c5cc2070STejun Heo * issued. When dispatching bios from the children and local group at each 33*c5cc2070STejun Heo * level, if the bios are dispatched into a single bio_list, there's a risk 34*c5cc2070STejun Heo * of a local or child group which can queue many bios at once filling up 35*c5cc2070STejun Heo * the list starving others. 36*c5cc2070STejun Heo * 37*c5cc2070STejun Heo * To avoid such starvation, dispatched bios are queued separately 38*c5cc2070STejun Heo * according to where they came from. When they are again dispatched to 39*c5cc2070STejun Heo * the parent, they're popped in round-robin order so that no single source 40*c5cc2070STejun Heo * hogs the dispatch window. 41*c5cc2070STejun Heo * 42*c5cc2070STejun Heo * throtl_qnode is used to keep the queued bios separated by their sources. 43*c5cc2070STejun Heo * Bios are queued to throtl_qnode which in turn is queued to 44*c5cc2070STejun Heo * throtl_service_queue and then dispatched in round-robin order. 45*c5cc2070STejun Heo * 46*c5cc2070STejun Heo * It's also used to track the reference counts on blkg's. A qnode always 47*c5cc2070STejun Heo * belongs to a throtl_grp and gets queued on itself or the parent, so 48*c5cc2070STejun Heo * incrementing the reference of the associated throtl_grp when a qnode is 49*c5cc2070STejun Heo * queued and decrementing when dequeued is enough to keep the whole blkg 50*c5cc2070STejun Heo * tree pinned while bios are in flight. 
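 *
 * For example, if one child group has a long backlog queued on its qnode
 * while the parent's own qnode holds only a few bios, round-robin popping
 * lets the parent's bios interleave with the child's instead of waiting
 * behind the entire backlog.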
51*c5cc2070STejun Heo */ 52*c5cc2070STejun Heo struct throtl_qnode { 53*c5cc2070STejun Heo struct list_head node; /* service_queue->queued[] */ 54*c5cc2070STejun Heo struct bio_list bios; /* queued bios */ 55*c5cc2070STejun Heo struct throtl_grp *tg; /* tg this qnode belongs to */ 56*c5cc2070STejun Heo }; 57*c5cc2070STejun Heo 58c9e0332eSTejun Heo struct throtl_service_queue { 5977216b04STejun Heo struct throtl_service_queue *parent_sq; /* the parent service_queue */ 6077216b04STejun Heo 6173f0d49aSTejun Heo /* 6273f0d49aSTejun Heo * Bios queued directly to this service_queue or dispatched from 6373f0d49aSTejun Heo * children throtl_grp's. 6473f0d49aSTejun Heo */ 65*c5cc2070STejun Heo struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ 6673f0d49aSTejun Heo unsigned int nr_queued[2]; /* number of queued bios */ 6773f0d49aSTejun Heo 6873f0d49aSTejun Heo /* 6973f0d49aSTejun Heo * RB tree of active children throtl_grp's, which are sorted by 7073f0d49aSTejun Heo * their ->disptime. 7173f0d49aSTejun Heo */ 72c9e0332eSTejun Heo struct rb_root pending_tree; /* RB tree of active tgs */ 73c9e0332eSTejun Heo struct rb_node *first_pending; /* first node in the tree */ 74c9e0332eSTejun Heo unsigned int nr_pending; /* # queued in the tree */ 75c9e0332eSTejun Heo unsigned long first_pending_disptime; /* disptime of the first tg */ 7669df0ab0STejun Heo struct timer_list pending_timer; /* fires on first_pending_disptime */ 77e43473b7SVivek Goyal }; 78e43473b7SVivek Goyal 795b2c16aaSTejun Heo enum tg_state_flags { 805b2c16aaSTejun Heo THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 810e9f4164STejun Heo THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ 825b2c16aaSTejun Heo }; 835b2c16aaSTejun Heo 84e43473b7SVivek Goyal #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 85e43473b7SVivek Goyal 868a3d2615STejun Heo /* Per-cpu group stats */ 878a3d2615STejun Heo struct tg_stats_cpu { 888a3d2615STejun Heo /* total bytes transferred */ 898a3d2615STejun Heo struct blkg_rwstat service_bytes; 908a3d2615STejun Heo /* total IOs serviced, post merge */ 918a3d2615STejun Heo struct blkg_rwstat serviced; 928a3d2615STejun Heo }; 938a3d2615STejun Heo 94e43473b7SVivek Goyal struct throtl_grp { 95f95a04afSTejun Heo /* must be the first member */ 96f95a04afSTejun Heo struct blkg_policy_data pd; 97f95a04afSTejun Heo 98c9e0332eSTejun Heo /* active throtl group service_queue member */ 99e43473b7SVivek Goyal struct rb_node rb_node; 100e43473b7SVivek Goyal 1010f3457f6STejun Heo /* throtl_data this group belongs to */ 1020f3457f6STejun Heo struct throtl_data *td; 1030f3457f6STejun Heo 10449a2f1e3STejun Heo /* this group's service queue */ 10549a2f1e3STejun Heo struct throtl_service_queue service_queue; 10649a2f1e3STejun Heo 107e43473b7SVivek Goyal /* 108*c5cc2070STejun Heo * qnode_on_self is used when bios are directly queued to this 109*c5cc2070STejun Heo * throtl_grp so that local bios compete fairly with bios 110*c5cc2070STejun Heo * dispatched from children. qnode_on_parent is used when bios are 111*c5cc2070STejun Heo * dispatched from this throtl_grp into its parent and will compete 112*c5cc2070STejun Heo * with the sibling qnode_on_parents and the parent's 113*c5cc2070STejun Heo * qnode_on_self. 114*c5cc2070STejun Heo */ 115*c5cc2070STejun Heo struct throtl_qnode qnode_on_self[2]; 116*c5cc2070STejun Heo struct throtl_qnode qnode_on_parent[2]; 117*c5cc2070STejun Heo 118*c5cc2070STejun Heo /* 119e43473b7SVivek Goyal * Dispatch time in jiffies. 
This is the estimated time when the group
120e43473b7SVivek Goyal * will unthrottle and be ready to dispatch more bios. It is used as
121e43473b7SVivek Goyal * the key to sort active groups in the service tree.
122e43473b7SVivek Goyal */
123e43473b7SVivek Goyal unsigned long disptime;
124e43473b7SVivek Goyal
125e43473b7SVivek Goyal unsigned int flags;
126e43473b7SVivek Goyal
127e43473b7SVivek Goyal /* bytes per second rate limits */
128e43473b7SVivek Goyal uint64_t bps[2];
129e43473b7SVivek Goyal
1308e89d13fSVivek Goyal /* IOPS limits */
1318e89d13fSVivek Goyal unsigned int iops[2];
1328e89d13fSVivek Goyal
133e43473b7SVivek Goyal /* Number of bytes dispatched in current slice */
134e43473b7SVivek Goyal uint64_t bytes_disp[2];
1358e89d13fSVivek Goyal /* Number of bios dispatched in current slice */
1368e89d13fSVivek Goyal unsigned int io_disp[2];
137e43473b7SVivek Goyal
138e43473b7SVivek Goyal /* When did we start a new slice */
139e43473b7SVivek Goyal unsigned long slice_start[2];
140e43473b7SVivek Goyal unsigned long slice_end[2];
141fe071437SVivek Goyal
1428a3d2615STejun Heo /* Per cpu stats pointer */
1438a3d2615STejun Heo struct tg_stats_cpu __percpu *stats_cpu;
1448a3d2615STejun Heo
1458a3d2615STejun Heo /* List of tgs waiting for per cpu stats memory to be allocated */
1468a3d2615STejun Heo struct list_head stats_alloc_node;
147e43473b7SVivek Goyal };
148e43473b7SVivek Goyal
149e43473b7SVivek Goyal struct throtl_data
150e43473b7SVivek Goyal {
151e43473b7SVivek Goyal /* service tree for active throtl groups */
152c9e0332eSTejun Heo struct throtl_service_queue service_queue;
153e43473b7SVivek Goyal
154e43473b7SVivek Goyal struct request_queue *queue;
155e43473b7SVivek Goyal
156e43473b7SVivek Goyal /* Total number of queued bios on READ and WRITE lists */
157e43473b7SVivek Goyal unsigned int nr_queued[2];
158e43473b7SVivek Goyal
159e43473b7SVivek Goyal /*
16002977e4aSVivek Goyal * number of total undestroyed groups
161e43473b7SVivek Goyal */
162e43473b7SVivek Goyal unsigned int nr_undestroyed_grps;
163e43473b7SVivek Goyal
164e43473b7SVivek Goyal /* Work for dispatching throttled bios */
16569df0ab0STejun Heo struct work_struct dispatch_work;
166e43473b7SVivek Goyal };
167e43473b7SVivek Goyal
1688a3d2615STejun Heo /* list and work item to allocate percpu group stats */
1698a3d2615STejun Heo static DEFINE_SPINLOCK(tg_stats_alloc_lock);
1708a3d2615STejun Heo static LIST_HEAD(tg_stats_alloc_list);
1718a3d2615STejun Heo
1728a3d2615STejun Heo static void tg_stats_alloc_fn(struct work_struct *);
1738a3d2615STejun Heo static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
1748a3d2615STejun Heo
17569df0ab0STejun Heo static void throtl_pending_timer_fn(unsigned long arg);
17669df0ab0STejun Heo
177f95a04afSTejun Heo static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
178f95a04afSTejun Heo {
179f95a04afSTejun Heo return pd ?
container_of(pd, struct throtl_grp, pd) : NULL;
180f95a04afSTejun Heo }
181f95a04afSTejun Heo
1823c798398STejun Heo static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
1830381411eSTejun Heo {
184f95a04afSTejun Heo return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
1850381411eSTejun Heo }
1860381411eSTejun Heo
1873c798398STejun Heo static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
1880381411eSTejun Heo {
189f95a04afSTejun Heo return pd_to_blkg(&tg->pd);
1900381411eSTejun Heo }
1910381411eSTejun Heo
19203d8e111STejun Heo static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
19303d8e111STejun Heo {
19403d8e111STejun Heo return blkg_to_tg(td->queue->root_blkg);
19503d8e111STejun Heo }
19603d8e111STejun Heo
197fda6f272STejun Heo /**
198fda6f272STejun Heo * sq_to_tg - return the throtl_grp the specified service queue belongs to
199fda6f272STejun Heo * @sq: the throtl_service_queue of interest
200fda6f272STejun Heo *
201fda6f272STejun Heo * Return the throtl_grp @sq belongs to. If @sq is the top-level one
202fda6f272STejun Heo * embedded in throtl_data, %NULL is returned.
203fda6f272STejun Heo */
204fda6f272STejun Heo static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
205fda6f272STejun Heo {
206fda6f272STejun Heo if (sq && sq->parent_sq)
207fda6f272STejun Heo return container_of(sq, struct throtl_grp, service_queue);
208fda6f272STejun Heo else
209fda6f272STejun Heo return NULL;
210fda6f272STejun Heo }
211fda6f272STejun Heo
212fda6f272STejun Heo /**
213fda6f272STejun Heo * sq_to_td - return the throtl_data the specified service queue belongs to
214fda6f272STejun Heo * @sq: the throtl_service_queue of interest
215fda6f272STejun Heo *
216fda6f272STejun Heo * A service_queue can be embedded in either a throtl_grp or throtl_data.
217fda6f272STejun Heo * Determine the associated throtl_data accordingly and return it.
218fda6f272STejun Heo */
219fda6f272STejun Heo static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
220fda6f272STejun Heo {
221fda6f272STejun Heo struct throtl_grp *tg = sq_to_tg(sq);
222fda6f272STejun Heo
223fda6f272STejun Heo if (tg)
224fda6f272STejun Heo return tg->td;
225fda6f272STejun Heo else
226fda6f272STejun Heo return container_of(sq, struct throtl_data, service_queue);
227fda6f272STejun Heo }
228fda6f272STejun Heo
229fda6f272STejun Heo /**
230fda6f272STejun Heo * throtl_log - log debug message via blktrace
231fda6f272STejun Heo * @sq: the service_queue being reported
232fda6f272STejun Heo * @fmt: printf format string
233fda6f272STejun Heo * @args: printf args
234fda6f272STejun Heo *
235fda6f272STejun Heo * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
236fda6f272STejun Heo * throtl_grp; otherwise, just "throtl".
237fda6f272STejun Heo *
238fda6f272STejun Heo * TODO: this should be made a function and name formatting should happen
239fda6f272STejun Heo * after testing whether blktrace is enabled.
240fda6f272STejun Heo */
241fda6f272STejun Heo #define throtl_log(sq, fmt, args...)
do { \ 242fda6f272STejun Heo struct throtl_grp *__tg = sq_to_tg((sq)); \ 243fda6f272STejun Heo struct throtl_data *__td = sq_to_td((sq)); \ 244fda6f272STejun Heo \ 245fda6f272STejun Heo (void)__td; \ 246fda6f272STejun Heo if ((__tg)) { \ 24754e7ed12STejun Heo char __pbuf[128]; \ 24854e7ed12STejun Heo \ 249fda6f272STejun Heo blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ 250fda6f272STejun Heo blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ 251fda6f272STejun Heo } else { \ 252fda6f272STejun Heo blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ 253fda6f272STejun Heo } \ 25454e7ed12STejun Heo } while (0) 255e43473b7SVivek Goyal 2568a3d2615STejun Heo /* 2578a3d2615STejun Heo * Worker for allocating per cpu stat for tgs. This is scheduled on the 2583b07e9caSTejun Heo * system_wq once there are some groups on the alloc_list waiting for 2598a3d2615STejun Heo * allocation. 2608a3d2615STejun Heo */ 2618a3d2615STejun Heo static void tg_stats_alloc_fn(struct work_struct *work) 2628a3d2615STejun Heo { 2638a3d2615STejun Heo static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ 2648a3d2615STejun Heo struct delayed_work *dwork = to_delayed_work(work); 2658a3d2615STejun Heo bool empty = false; 2668a3d2615STejun Heo 2678a3d2615STejun Heo alloc_stats: 2688a3d2615STejun Heo if (!stats_cpu) { 2698a3d2615STejun Heo stats_cpu = alloc_percpu(struct tg_stats_cpu); 2708a3d2615STejun Heo if (!stats_cpu) { 2718a3d2615STejun Heo /* allocation failed, try again after some time */ 2723b07e9caSTejun Heo schedule_delayed_work(dwork, msecs_to_jiffies(10)); 2738a3d2615STejun Heo return; 2748a3d2615STejun Heo } 2758a3d2615STejun Heo } 2768a3d2615STejun Heo 2778a3d2615STejun Heo spin_lock_irq(&tg_stats_alloc_lock); 2788a3d2615STejun Heo 2798a3d2615STejun Heo if (!list_empty(&tg_stats_alloc_list)) { 2808a3d2615STejun Heo struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, 2818a3d2615STejun Heo struct throtl_grp, 2828a3d2615STejun Heo stats_alloc_node); 2838a3d2615STejun Heo swap(tg->stats_cpu, stats_cpu); 2848a3d2615STejun Heo list_del_init(&tg->stats_alloc_node); 2858a3d2615STejun Heo } 2868a3d2615STejun Heo 2878a3d2615STejun Heo empty = list_empty(&tg_stats_alloc_list); 2888a3d2615STejun Heo spin_unlock_irq(&tg_stats_alloc_lock); 2898a3d2615STejun Heo if (!empty) 2908a3d2615STejun Heo goto alloc_stats; 2918a3d2615STejun Heo } 2928a3d2615STejun Heo 293*c5cc2070STejun Heo static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 294*c5cc2070STejun Heo { 295*c5cc2070STejun Heo INIT_LIST_HEAD(&qn->node); 296*c5cc2070STejun Heo bio_list_init(&qn->bios); 297*c5cc2070STejun Heo qn->tg = tg; 298*c5cc2070STejun Heo } 299*c5cc2070STejun Heo 300*c5cc2070STejun Heo /** 301*c5cc2070STejun Heo * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it 302*c5cc2070STejun Heo * @bio: bio being added 303*c5cc2070STejun Heo * @qn: qnode to add bio to 304*c5cc2070STejun Heo * @queued: the service_queue->queued[] list @qn belongs to 305*c5cc2070STejun Heo * 306*c5cc2070STejun Heo * Add @bio to @qn and put @qn on @queued if it's not already on. 307*c5cc2070STejun Heo * @qn->tg's reference count is bumped when @qn is activated. See the 308*c5cc2070STejun Heo * comment on top of throtl_qnode definition for details. 
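 * The reference taken here is dropped again by throtl_pop_queued() (or by
 * its caller via @tg_to_put) once @qn has been emptied.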
309*c5cc2070STejun Heo */
310*c5cc2070STejun Heo static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
311*c5cc2070STejun Heo struct list_head *queued)
312*c5cc2070STejun Heo {
313*c5cc2070STejun Heo bio_list_add(&qn->bios, bio);
314*c5cc2070STejun Heo if (list_empty(&qn->node)) {
315*c5cc2070STejun Heo list_add_tail(&qn->node, queued);
316*c5cc2070STejun Heo blkg_get(tg_to_blkg(qn->tg));
317*c5cc2070STejun Heo }
318*c5cc2070STejun Heo }
319*c5cc2070STejun Heo
320*c5cc2070STejun Heo /**
321*c5cc2070STejun Heo * throtl_peek_queued - peek the first bio on a qnode list
322*c5cc2070STejun Heo * @queued: the qnode list to peek
323*c5cc2070STejun Heo */
324*c5cc2070STejun Heo static struct bio *throtl_peek_queued(struct list_head *queued)
325*c5cc2070STejun Heo {
326*c5cc2070STejun Heo struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
327*c5cc2070STejun Heo struct bio *bio;
328*c5cc2070STejun Heo
329*c5cc2070STejun Heo if (list_empty(queued))
330*c5cc2070STejun Heo return NULL;
331*c5cc2070STejun Heo
332*c5cc2070STejun Heo bio = bio_list_peek(&qn->bios);
333*c5cc2070STejun Heo WARN_ON_ONCE(!bio);
334*c5cc2070STejun Heo return bio;
335*c5cc2070STejun Heo }
336*c5cc2070STejun Heo
337*c5cc2070STejun Heo /**
338*c5cc2070STejun Heo * throtl_pop_queued - pop the first bio from a qnode list
339*c5cc2070STejun Heo * @queued: the qnode list to pop a bio from
340*c5cc2070STejun Heo * @tg_to_put: optional out argument for throtl_grp to put
341*c5cc2070STejun Heo *
342*c5cc2070STejun Heo * Pop the first bio from the qnode list @queued. After popping, the first
343*c5cc2070STejun Heo * qnode is removed from @queued if empty or moved to the end of @queued so
344*c5cc2070STejun Heo * that the popping order is round-robin.
345*c5cc2070STejun Heo *
346*c5cc2070STejun Heo * When the first qnode is removed, its associated throtl_grp should be put
347*c5cc2070STejun Heo * too. If @tg_to_put is NULL, this function automatically puts it;
348*c5cc2070STejun Heo * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
349*c5cc2070STejun Heo * responsible for putting it.
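 * tg_dispatch_one_bio() passes @tg_to_put so that the put can be delayed
 * until the popped bio has been transferred to the parent's queue.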
350*c5cc2070STejun Heo */ 351*c5cc2070STejun Heo static struct bio *throtl_pop_queued(struct list_head *queued, 352*c5cc2070STejun Heo struct throtl_grp **tg_to_put) 353*c5cc2070STejun Heo { 354*c5cc2070STejun Heo struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); 355*c5cc2070STejun Heo struct bio *bio; 356*c5cc2070STejun Heo 357*c5cc2070STejun Heo if (list_empty(queued)) 358*c5cc2070STejun Heo return NULL; 359*c5cc2070STejun Heo 360*c5cc2070STejun Heo bio = bio_list_pop(&qn->bios); 361*c5cc2070STejun Heo WARN_ON_ONCE(!bio); 362*c5cc2070STejun Heo 363*c5cc2070STejun Heo if (bio_list_empty(&qn->bios)) { 364*c5cc2070STejun Heo list_del_init(&qn->node); 365*c5cc2070STejun Heo if (tg_to_put) 366*c5cc2070STejun Heo *tg_to_put = qn->tg; 367*c5cc2070STejun Heo else 368*c5cc2070STejun Heo blkg_put(tg_to_blkg(qn->tg)); 369*c5cc2070STejun Heo } else { 370*c5cc2070STejun Heo list_move_tail(&qn->node, queued); 371*c5cc2070STejun Heo } 372*c5cc2070STejun Heo 373*c5cc2070STejun Heo return bio; 374*c5cc2070STejun Heo } 375*c5cc2070STejun Heo 37649a2f1e3STejun Heo /* init a service_queue, assumes the caller zeroed it */ 37777216b04STejun Heo static void throtl_service_queue_init(struct throtl_service_queue *sq, 37877216b04STejun Heo struct throtl_service_queue *parent_sq) 37949a2f1e3STejun Heo { 380*c5cc2070STejun Heo INIT_LIST_HEAD(&sq->queued[0]); 381*c5cc2070STejun Heo INIT_LIST_HEAD(&sq->queued[1]); 38249a2f1e3STejun Heo sq->pending_tree = RB_ROOT; 38377216b04STejun Heo sq->parent_sq = parent_sq; 38469df0ab0STejun Heo setup_timer(&sq->pending_timer, throtl_pending_timer_fn, 38569df0ab0STejun Heo (unsigned long)sq); 38669df0ab0STejun Heo } 38769df0ab0STejun Heo 38869df0ab0STejun Heo static void throtl_service_queue_exit(struct throtl_service_queue *sq) 38969df0ab0STejun Heo { 39069df0ab0STejun Heo del_timer_sync(&sq->pending_timer); 39149a2f1e3STejun Heo } 39249a2f1e3STejun Heo 3933c798398STejun Heo static void throtl_pd_init(struct blkcg_gq *blkg) 394a29a171eSVivek Goyal { 3950381411eSTejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 39677216b04STejun Heo struct throtl_data *td = blkg->q->td; 397ff26eaadSTejun Heo unsigned long flags; 398*c5cc2070STejun Heo int rw; 399cd1604faSTejun Heo 40077216b04STejun Heo throtl_service_queue_init(&tg->service_queue, &td->service_queue); 401*c5cc2070STejun Heo for (rw = READ; rw <= WRITE; rw++) { 402*c5cc2070STejun Heo throtl_qnode_init(&tg->qnode_on_self[rw], tg); 403*c5cc2070STejun Heo throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 404*c5cc2070STejun Heo } 405*c5cc2070STejun Heo 406a29a171eSVivek Goyal RB_CLEAR_NODE(&tg->rb_node); 40777216b04STejun Heo tg->td = td; 408a29a171eSVivek Goyal 409e56da7e2STejun Heo tg->bps[READ] = -1; 410e56da7e2STejun Heo tg->bps[WRITE] = -1; 411e56da7e2STejun Heo tg->iops[READ] = -1; 412e56da7e2STejun Heo tg->iops[WRITE] = -1; 4138a3d2615STejun Heo 4148a3d2615STejun Heo /* 4158a3d2615STejun Heo * Ugh... We need to perform per-cpu allocation for tg->stats_cpu 4168a3d2615STejun Heo * but percpu allocator can't be called from IO path. Queue tg on 4178a3d2615STejun Heo * tg_stats_alloc_list and allocate from work item. 
4188a3d2615STejun Heo */ 419ff26eaadSTejun Heo spin_lock_irqsave(&tg_stats_alloc_lock, flags); 4208a3d2615STejun Heo list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); 4213b07e9caSTejun Heo schedule_delayed_work(&tg_stats_alloc_work, 0); 422ff26eaadSTejun Heo spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 4238a3d2615STejun Heo } 4248a3d2615STejun Heo 4253c798398STejun Heo static void throtl_pd_exit(struct blkcg_gq *blkg) 4268a3d2615STejun Heo { 4278a3d2615STejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 428ff26eaadSTejun Heo unsigned long flags; 4298a3d2615STejun Heo 430ff26eaadSTejun Heo spin_lock_irqsave(&tg_stats_alloc_lock, flags); 4318a3d2615STejun Heo list_del_init(&tg->stats_alloc_node); 432ff26eaadSTejun Heo spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 4338a3d2615STejun Heo 4348a3d2615STejun Heo free_percpu(tg->stats_cpu); 43569df0ab0STejun Heo 43669df0ab0STejun Heo throtl_service_queue_exit(&tg->service_queue); 4378a3d2615STejun Heo } 4388a3d2615STejun Heo 4393c798398STejun Heo static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 4408a3d2615STejun Heo { 4418a3d2615STejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 4428a3d2615STejun Heo int cpu; 4438a3d2615STejun Heo 4448a3d2615STejun Heo if (tg->stats_cpu == NULL) 4458a3d2615STejun Heo return; 4468a3d2615STejun Heo 4478a3d2615STejun Heo for_each_possible_cpu(cpu) { 4488a3d2615STejun Heo struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 4498a3d2615STejun Heo 4508a3d2615STejun Heo blkg_rwstat_reset(&sc->service_bytes); 4518a3d2615STejun Heo blkg_rwstat_reset(&sc->serviced); 4528a3d2615STejun Heo } 453a29a171eSVivek Goyal } 454a29a171eSVivek Goyal 4553c798398STejun Heo static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, 4563c798398STejun Heo struct blkcg *blkcg) 457e43473b7SVivek Goyal { 458e43473b7SVivek Goyal /* 4593c798398STejun Heo * This is the common case when there are no blkcgs. Avoid lookup 4603c798398STejun Heo * in this case 461be2c6b19SVivek Goyal */ 4623c798398STejun Heo if (blkcg == &blkcg_root) 46303d8e111STejun Heo return td_root_tg(td); 464e43473b7SVivek Goyal 465e8989faeSTejun Heo return blkg_to_tg(blkg_lookup(blkcg, td->queue)); 466e43473b7SVivek Goyal } 467e43473b7SVivek Goyal 468cd1604faSTejun Heo static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 4693c798398STejun Heo struct blkcg *blkcg) 470e43473b7SVivek Goyal { 471f469a7b4SVivek Goyal struct request_queue *q = td->queue; 472cd1604faSTejun Heo struct throtl_grp *tg = NULL; 4730a5a7d0eSTejun Heo 474f469a7b4SVivek Goyal /* 4753c798398STejun Heo * This is the common case when there are no blkcgs. 
Avoid lookup 4763c798398STejun Heo * in this case 477f469a7b4SVivek Goyal */ 4783c798398STejun Heo if (blkcg == &blkcg_root) { 47903d8e111STejun Heo tg = td_root_tg(td); 480cd1604faSTejun Heo } else { 4813c798398STejun Heo struct blkcg_gq *blkg; 482cd1604faSTejun Heo 4833c96cb32STejun Heo blkg = blkg_lookup_create(blkcg, q); 484cd1604faSTejun Heo 485cd1604faSTejun Heo /* if %NULL and @q is alive, fall back to root_tg */ 486cd1604faSTejun Heo if (!IS_ERR(blkg)) 4870381411eSTejun Heo tg = blkg_to_tg(blkg); 4883f3299d5SBart Van Assche else if (!blk_queue_dying(q)) 48903d8e111STejun Heo tg = td_root_tg(td); 490f469a7b4SVivek Goyal } 491f469a7b4SVivek Goyal 492e43473b7SVivek Goyal return tg; 493e43473b7SVivek Goyal } 494e43473b7SVivek Goyal 4950049af73STejun Heo static struct throtl_grp * 4960049af73STejun Heo throtl_rb_first(struct throtl_service_queue *parent_sq) 497e43473b7SVivek Goyal { 498e43473b7SVivek Goyal /* Service tree is empty */ 4990049af73STejun Heo if (!parent_sq->nr_pending) 500e43473b7SVivek Goyal return NULL; 501e43473b7SVivek Goyal 5020049af73STejun Heo if (!parent_sq->first_pending) 5030049af73STejun Heo parent_sq->first_pending = rb_first(&parent_sq->pending_tree); 504e43473b7SVivek Goyal 5050049af73STejun Heo if (parent_sq->first_pending) 5060049af73STejun Heo return rb_entry_tg(parent_sq->first_pending); 507e43473b7SVivek Goyal 508e43473b7SVivek Goyal return NULL; 509e43473b7SVivek Goyal } 510e43473b7SVivek Goyal 511e43473b7SVivek Goyal static void rb_erase_init(struct rb_node *n, struct rb_root *root) 512e43473b7SVivek Goyal { 513e43473b7SVivek Goyal rb_erase(n, root); 514e43473b7SVivek Goyal RB_CLEAR_NODE(n); 515e43473b7SVivek Goyal } 516e43473b7SVivek Goyal 5170049af73STejun Heo static void throtl_rb_erase(struct rb_node *n, 5180049af73STejun Heo struct throtl_service_queue *parent_sq) 519e43473b7SVivek Goyal { 5200049af73STejun Heo if (parent_sq->first_pending == n) 5210049af73STejun Heo parent_sq->first_pending = NULL; 5220049af73STejun Heo rb_erase_init(n, &parent_sq->pending_tree); 5230049af73STejun Heo --parent_sq->nr_pending; 524e43473b7SVivek Goyal } 525e43473b7SVivek Goyal 5260049af73STejun Heo static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 527e43473b7SVivek Goyal { 528e43473b7SVivek Goyal struct throtl_grp *tg; 529e43473b7SVivek Goyal 5300049af73STejun Heo tg = throtl_rb_first(parent_sq); 531e43473b7SVivek Goyal if (!tg) 532e43473b7SVivek Goyal return; 533e43473b7SVivek Goyal 5340049af73STejun Heo parent_sq->first_pending_disptime = tg->disptime; 535e43473b7SVivek Goyal } 536e43473b7SVivek Goyal 53777216b04STejun Heo static void tg_service_queue_add(struct throtl_grp *tg) 538e43473b7SVivek Goyal { 53977216b04STejun Heo struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 5400049af73STejun Heo struct rb_node **node = &parent_sq->pending_tree.rb_node; 541e43473b7SVivek Goyal struct rb_node *parent = NULL; 542e43473b7SVivek Goyal struct throtl_grp *__tg; 543e43473b7SVivek Goyal unsigned long key = tg->disptime; 544e43473b7SVivek Goyal int left = 1; 545e43473b7SVivek Goyal 546e43473b7SVivek Goyal while (*node != NULL) { 547e43473b7SVivek Goyal parent = *node; 548e43473b7SVivek Goyal __tg = rb_entry_tg(parent); 549e43473b7SVivek Goyal 550e43473b7SVivek Goyal if (time_before(key, __tg->disptime)) 551e43473b7SVivek Goyal node = &parent->rb_left; 552e43473b7SVivek Goyal else { 553e43473b7SVivek Goyal node = &parent->rb_right; 554e43473b7SVivek Goyal left = 0; 555e43473b7SVivek Goyal } 556e43473b7SVivek Goyal } 
557e43473b7SVivek Goyal 558e43473b7SVivek Goyal if (left) 5590049af73STejun Heo parent_sq->first_pending = &tg->rb_node; 560e43473b7SVivek Goyal 561e43473b7SVivek Goyal rb_link_node(&tg->rb_node, parent, node); 5620049af73STejun Heo rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); 563e43473b7SVivek Goyal } 564e43473b7SVivek Goyal 56577216b04STejun Heo static void __throtl_enqueue_tg(struct throtl_grp *tg) 566e43473b7SVivek Goyal { 56777216b04STejun Heo tg_service_queue_add(tg); 5685b2c16aaSTejun Heo tg->flags |= THROTL_TG_PENDING; 56977216b04STejun Heo tg->service_queue.parent_sq->nr_pending++; 570e43473b7SVivek Goyal } 571e43473b7SVivek Goyal 57277216b04STejun Heo static void throtl_enqueue_tg(struct throtl_grp *tg) 573e43473b7SVivek Goyal { 5745b2c16aaSTejun Heo if (!(tg->flags & THROTL_TG_PENDING)) 57577216b04STejun Heo __throtl_enqueue_tg(tg); 576e43473b7SVivek Goyal } 577e43473b7SVivek Goyal 57877216b04STejun Heo static void __throtl_dequeue_tg(struct throtl_grp *tg) 579e43473b7SVivek Goyal { 58077216b04STejun Heo throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 5815b2c16aaSTejun Heo tg->flags &= ~THROTL_TG_PENDING; 582e43473b7SVivek Goyal } 583e43473b7SVivek Goyal 58477216b04STejun Heo static void throtl_dequeue_tg(struct throtl_grp *tg) 585e43473b7SVivek Goyal { 5865b2c16aaSTejun Heo if (tg->flags & THROTL_TG_PENDING) 58777216b04STejun Heo __throtl_dequeue_tg(tg); 588e43473b7SVivek Goyal } 589e43473b7SVivek Goyal 590a9131a27STejun Heo /* Call with queue lock held */ 59169df0ab0STejun Heo static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 59269df0ab0STejun Heo unsigned long expires) 593a9131a27STejun Heo { 59469df0ab0STejun Heo mod_timer(&sq->pending_timer, expires); 59569df0ab0STejun Heo throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 59669df0ab0STejun Heo expires - jiffies, jiffies); 597a9131a27STejun Heo } 598a9131a27STejun Heo 5997f52f98cSTejun Heo /** 6007f52f98cSTejun Heo * throtl_schedule_next_dispatch - schedule the next dispatch cycle 6017f52f98cSTejun Heo * @sq: the service_queue to schedule dispatch for 6027f52f98cSTejun Heo * @force: force scheduling 6037f52f98cSTejun Heo * 6047f52f98cSTejun Heo * Arm @sq->pending_timer so that the next dispatch cycle starts on the 6057f52f98cSTejun Heo * dispatch time of the first pending child. Returns %true if either timer 6067f52f98cSTejun Heo * is armed or there's no pending child left. %false if the current 6077f52f98cSTejun Heo * dispatch window is still open and the caller should continue 6087f52f98cSTejun Heo * dispatching. 6097f52f98cSTejun Heo * 6107f52f98cSTejun Heo * If @force is %true, the dispatch timer is always scheduled and this 6117f52f98cSTejun Heo * function is guaranteed to return %true. This is to be used when the 6127f52f98cSTejun Heo * caller can't dispatch itself and needs to invoke pending_timer 6137f52f98cSTejun Heo * unconditionally. Note that forced scheduling is likely to induce short 6147f52f98cSTejun Heo * delay before dispatch starts even if @sq->first_pending_disptime is not 6157f52f98cSTejun Heo * in the future and thus shouldn't be used in hot paths. 6167f52f98cSTejun Heo */ 6177f52f98cSTejun Heo static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 6187f52f98cSTejun Heo bool force) 619e43473b7SVivek Goyal { 6206a525600STejun Heo /* any pending children left? 
*/ 621c9e0332eSTejun Heo if (!sq->nr_pending) 6227f52f98cSTejun Heo return true; 623e43473b7SVivek Goyal 624c9e0332eSTejun Heo update_min_dispatch_time(sq); 625e43473b7SVivek Goyal 62669df0ab0STejun Heo /* is the next dispatch time in the future? */ 6277f52f98cSTejun Heo if (force || time_after(sq->first_pending_disptime, jiffies)) { 62869df0ab0STejun Heo throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 6297f52f98cSTejun Heo return true; 63069df0ab0STejun Heo } 63169df0ab0STejun Heo 6327f52f98cSTejun Heo /* tell the caller to continue dispatching */ 6337f52f98cSTejun Heo return false; 634e43473b7SVivek Goyal } 635e43473b7SVivek Goyal 6360f3457f6STejun Heo static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) 637e43473b7SVivek Goyal { 638e43473b7SVivek Goyal tg->bytes_disp[rw] = 0; 6398e89d13fSVivek Goyal tg->io_disp[rw] = 0; 640e43473b7SVivek Goyal tg->slice_start[rw] = jiffies; 641e43473b7SVivek Goyal tg->slice_end[rw] = jiffies + throtl_slice; 642fda6f272STejun Heo throtl_log(&tg->service_queue, 643fda6f272STejun Heo "[%c] new slice start=%lu end=%lu jiffies=%lu", 644e43473b7SVivek Goyal rw == READ ? 'R' : 'W', tg->slice_start[rw], 645e43473b7SVivek Goyal tg->slice_end[rw], jiffies); 646e43473b7SVivek Goyal } 647e43473b7SVivek Goyal 6480f3457f6STejun Heo static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 6490f3457f6STejun Heo unsigned long jiffy_end) 650d1ae8ffdSVivek Goyal { 651d1ae8ffdSVivek Goyal tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 652d1ae8ffdSVivek Goyal } 653d1ae8ffdSVivek Goyal 6540f3457f6STejun Heo static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 6550f3457f6STejun Heo unsigned long jiffy_end) 656e43473b7SVivek Goyal { 657e43473b7SVivek Goyal tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 658fda6f272STejun Heo throtl_log(&tg->service_queue, 659fda6f272STejun Heo "[%c] extend slice start=%lu end=%lu jiffies=%lu", 660e43473b7SVivek Goyal rw == READ ? 'R' : 'W', tg->slice_start[rw], 661e43473b7SVivek Goyal tg->slice_end[rw], jiffies); 662e43473b7SVivek Goyal } 663e43473b7SVivek Goyal 664e43473b7SVivek Goyal /* Determine if previously allocated or extended slice is complete or not */ 6650f3457f6STejun Heo static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 666e43473b7SVivek Goyal { 667e43473b7SVivek Goyal if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 668e43473b7SVivek Goyal return 0; 669e43473b7SVivek Goyal 670e43473b7SVivek Goyal return 1; 671e43473b7SVivek Goyal } 672e43473b7SVivek Goyal 673e43473b7SVivek Goyal /* Trim the used slices and adjust slice start accordingly */ 6740f3457f6STejun Heo static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 675e43473b7SVivek Goyal { 6763aad5d3eSVivek Goyal unsigned long nr_slices, time_elapsed, io_trim; 6773aad5d3eSVivek Goyal u64 bytes_trim, tmp; 678e43473b7SVivek Goyal 679e43473b7SVivek Goyal BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 680e43473b7SVivek Goyal 681e43473b7SVivek Goyal /* 682e43473b7SVivek Goyal * If bps are unlimited (-1), then time slice don't get 683e43473b7SVivek Goyal * renewed. Don't try to trim the slice if slice is used. A new 684e43473b7SVivek Goyal * slice will start when appropriate. 685e43473b7SVivek Goyal */ 6860f3457f6STejun Heo if (throtl_slice_used(tg, rw)) 687e43473b7SVivek Goyal return; 688e43473b7SVivek Goyal 689d1ae8ffdSVivek Goyal /* 690d1ae8ffdSVivek Goyal * A bio has been dispatched. Also adjust slice_end. 
It might happen
691d1ae8ffdSVivek Goyal * that initially the cgroup limit was very low, resulting in a high
692d1ae8ffdSVivek Goyal * slice_end, but later the limit was bumped up and the bio was dispatched
693d1ae8ffdSVivek Goyal * sooner; then we need to reduce slice_end. A high bogus slice_end
694d1ae8ffdSVivek Goyal * is bad because it does not allow a new slice to start.
695d1ae8ffdSVivek Goyal */
696d1ae8ffdSVivek Goyal
6970f3457f6STejun Heo throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
698d1ae8ffdSVivek Goyal
699e43473b7SVivek Goyal time_elapsed = jiffies - tg->slice_start[rw];
700e43473b7SVivek Goyal
701e43473b7SVivek Goyal nr_slices = time_elapsed / throtl_slice;
702e43473b7SVivek Goyal
703e43473b7SVivek Goyal if (!nr_slices)
704e43473b7SVivek Goyal return;
7053aad5d3eSVivek Goyal tmp = tg->bps[rw] * throtl_slice * nr_slices;
7063aad5d3eSVivek Goyal do_div(tmp, HZ);
7073aad5d3eSVivek Goyal bytes_trim = tmp;
708e43473b7SVivek Goyal
7098e89d13fSVivek Goyal io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
710e43473b7SVivek Goyal
7118e89d13fSVivek Goyal if (!bytes_trim && !io_trim)
712e43473b7SVivek Goyal return;
713e43473b7SVivek Goyal
714e43473b7SVivek Goyal if (tg->bytes_disp[rw] >= bytes_trim)
715e43473b7SVivek Goyal tg->bytes_disp[rw] -= bytes_trim;
716e43473b7SVivek Goyal else
717e43473b7SVivek Goyal tg->bytes_disp[rw] = 0;
718e43473b7SVivek Goyal
7198e89d13fSVivek Goyal if (tg->io_disp[rw] >= io_trim)
7208e89d13fSVivek Goyal tg->io_disp[rw] -= io_trim;
7218e89d13fSVivek Goyal else
7228e89d13fSVivek Goyal tg->io_disp[rw] = 0;
7238e89d13fSVivek Goyal
724e43473b7SVivek Goyal tg->slice_start[rw] += nr_slices * throtl_slice;
725e43473b7SVivek Goyal
726fda6f272STejun Heo throtl_log(&tg->service_queue,
727fda6f272STejun Heo "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
7288e89d13fSVivek Goyal rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
729e43473b7SVivek Goyal tg->slice_start[rw], tg->slice_end[rw], jiffies);
730e43473b7SVivek Goyal }
731e43473b7SVivek Goyal
7320f3457f6STejun Heo static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
7330f3457f6STejun Heo unsigned long *wait)
734e43473b7SVivek Goyal {
735e43473b7SVivek Goyal bool rw = bio_data_dir(bio);
7368e89d13fSVivek Goyal unsigned int io_allowed;
737e43473b7SVivek Goyal unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
738c49c06e4SVivek Goyal u64 tmp;
739e43473b7SVivek Goyal
7408e89d13fSVivek Goyal jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
741e43473b7SVivek Goyal
7428e89d13fSVivek Goyal /* Slice has just started. Consider one slice interval */
7438e89d13fSVivek Goyal if (!jiffy_elapsed)
7448e89d13fSVivek Goyal jiffy_elapsed_rnd = throtl_slice;
7458e89d13fSVivek Goyal
7468e89d13fSVivek Goyal jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
7478e89d13fSVivek Goyal
748c49c06e4SVivek Goyal /*
749c49c06e4SVivek Goyal * jiffy_elapsed_rnd should not be a big value: the minimum iops is 1,
750c49c06e4SVivek Goyal * so at most jiffy_elapsed should be equivalent to 1 second, as we
751c49c06e4SVivek Goyal * will allow dispatch after 1 second and after that the slice should
752c49c06e4SVivek Goyal * have been trimmed.
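 * For example, with an iops limit of 1 and jiffy_elapsed_rnd of about HZ,
 * tmp works out to roughly 1, far below UINT_MAX; the clamp below only
 * matters for very large iops values.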
753c49c06e4SVivek Goyal */ 754c49c06e4SVivek Goyal 755c49c06e4SVivek Goyal tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; 756c49c06e4SVivek Goyal do_div(tmp, HZ); 757c49c06e4SVivek Goyal 758c49c06e4SVivek Goyal if (tmp > UINT_MAX) 759c49c06e4SVivek Goyal io_allowed = UINT_MAX; 760c49c06e4SVivek Goyal else 761c49c06e4SVivek Goyal io_allowed = tmp; 7628e89d13fSVivek Goyal 7638e89d13fSVivek Goyal if (tg->io_disp[rw] + 1 <= io_allowed) { 764e43473b7SVivek Goyal if (wait) 765e43473b7SVivek Goyal *wait = 0; 766e43473b7SVivek Goyal return 1; 767e43473b7SVivek Goyal } 768e43473b7SVivek Goyal 7698e89d13fSVivek Goyal /* Calc approx time to dispatch */ 7708e89d13fSVivek Goyal jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; 7718e89d13fSVivek Goyal 7728e89d13fSVivek Goyal if (jiffy_wait > jiffy_elapsed) 7738e89d13fSVivek Goyal jiffy_wait = jiffy_wait - jiffy_elapsed; 7748e89d13fSVivek Goyal else 7758e89d13fSVivek Goyal jiffy_wait = 1; 7768e89d13fSVivek Goyal 7778e89d13fSVivek Goyal if (wait) 7788e89d13fSVivek Goyal *wait = jiffy_wait; 7798e89d13fSVivek Goyal return 0; 780e43473b7SVivek Goyal } 781e43473b7SVivek Goyal 7820f3457f6STejun Heo static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 7830f3457f6STejun Heo unsigned long *wait) 7848e89d13fSVivek Goyal { 7858e89d13fSVivek Goyal bool rw = bio_data_dir(bio); 7863aad5d3eSVivek Goyal u64 bytes_allowed, extra_bytes, tmp; 7878e89d13fSVivek Goyal unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 7888e89d13fSVivek Goyal 789e43473b7SVivek Goyal jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 790e43473b7SVivek Goyal 791e43473b7SVivek Goyal /* Slice has just started. Consider one slice interval */ 792e43473b7SVivek Goyal if (!jiffy_elapsed) 793e43473b7SVivek Goyal jiffy_elapsed_rnd = throtl_slice; 794e43473b7SVivek Goyal 795e43473b7SVivek Goyal jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 796e43473b7SVivek Goyal 7975e901a2bSVivek Goyal tmp = tg->bps[rw] * jiffy_elapsed_rnd; 7985e901a2bSVivek Goyal do_div(tmp, HZ); 7993aad5d3eSVivek Goyal bytes_allowed = tmp; 800e43473b7SVivek Goyal 801e43473b7SVivek Goyal if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { 802e43473b7SVivek Goyal if (wait) 803e43473b7SVivek Goyal *wait = 0; 804e43473b7SVivek Goyal return 1; 805e43473b7SVivek Goyal } 806e43473b7SVivek Goyal 807e43473b7SVivek Goyal /* Calc approx time to dispatch */ 808e43473b7SVivek Goyal extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; 809e43473b7SVivek Goyal jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); 810e43473b7SVivek Goyal 811e43473b7SVivek Goyal if (!jiffy_wait) 812e43473b7SVivek Goyal jiffy_wait = 1; 813e43473b7SVivek Goyal 814e43473b7SVivek Goyal /* 815e43473b7SVivek Goyal * This wait time is without taking into consideration the rounding 816e43473b7SVivek Goyal * up we did. Add that time also. 817e43473b7SVivek Goyal */ 818e43473b7SVivek Goyal jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 819e43473b7SVivek Goyal if (wait) 820e43473b7SVivek Goyal *wait = jiffy_wait; 8218e89d13fSVivek Goyal return 0; 8228e89d13fSVivek Goyal } 823e43473b7SVivek Goyal 824af75cd3cSVivek Goyal static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { 825af75cd3cSVivek Goyal if (tg->bps[rw] == -1 && tg->iops[rw] == -1) 826af75cd3cSVivek Goyal return 1; 827af75cd3cSVivek Goyal return 0; 828af75cd3cSVivek Goyal } 829af75cd3cSVivek Goyal 8308e89d13fSVivek Goyal /* 8318e89d13fSVivek Goyal * Returns whether one can dispatch a bio or not. 
Also returns approx number 8328e89d13fSVivek Goyal * of jiffies to wait before this bio is with-in IO rate and can be dispatched 8338e89d13fSVivek Goyal */ 8340f3457f6STejun Heo static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 8350f3457f6STejun Heo unsigned long *wait) 8368e89d13fSVivek Goyal { 8378e89d13fSVivek Goyal bool rw = bio_data_dir(bio); 8388e89d13fSVivek Goyal unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 8398e89d13fSVivek Goyal 8408e89d13fSVivek Goyal /* 8418e89d13fSVivek Goyal * Currently whole state machine of group depends on first bio 8428e89d13fSVivek Goyal * queued in the group bio list. So one should not be calling 8438e89d13fSVivek Goyal * this function with a different bio if there are other bios 8448e89d13fSVivek Goyal * queued. 8458e89d13fSVivek Goyal */ 84673f0d49aSTejun Heo BUG_ON(tg->service_queue.nr_queued[rw] && 847*c5cc2070STejun Heo bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 8488e89d13fSVivek Goyal 8498e89d13fSVivek Goyal /* If tg->bps = -1, then BW is unlimited */ 8508e89d13fSVivek Goyal if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 8518e89d13fSVivek Goyal if (wait) 8528e89d13fSVivek Goyal *wait = 0; 8538e89d13fSVivek Goyal return 1; 8548e89d13fSVivek Goyal } 8558e89d13fSVivek Goyal 8568e89d13fSVivek Goyal /* 8578e89d13fSVivek Goyal * If previous slice expired, start a new one otherwise renew/extend 8588e89d13fSVivek Goyal * existing slice to make sure it is at least throtl_slice interval 8598e89d13fSVivek Goyal * long since now. 8608e89d13fSVivek Goyal */ 8610f3457f6STejun Heo if (throtl_slice_used(tg, rw)) 8620f3457f6STejun Heo throtl_start_new_slice(tg, rw); 8638e89d13fSVivek Goyal else { 8648e89d13fSVivek Goyal if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 8650f3457f6STejun Heo throtl_extend_slice(tg, rw, jiffies + throtl_slice); 8668e89d13fSVivek Goyal } 8678e89d13fSVivek Goyal 8680f3457f6STejun Heo if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 8690f3457f6STejun Heo tg_with_in_iops_limit(tg, bio, &iops_wait)) { 8708e89d13fSVivek Goyal if (wait) 8718e89d13fSVivek Goyal *wait = 0; 8728e89d13fSVivek Goyal return 1; 8738e89d13fSVivek Goyal } 8748e89d13fSVivek Goyal 8758e89d13fSVivek Goyal max_wait = max(bps_wait, iops_wait); 8768e89d13fSVivek Goyal 8778e89d13fSVivek Goyal if (wait) 8788e89d13fSVivek Goyal *wait = max_wait; 8798e89d13fSVivek Goyal 8808e89d13fSVivek Goyal if (time_before(tg->slice_end[rw], jiffies + max_wait)) 8810f3457f6STejun Heo throtl_extend_slice(tg, rw, jiffies + max_wait); 882e43473b7SVivek Goyal 883e43473b7SVivek Goyal return 0; 884e43473b7SVivek Goyal } 885e43473b7SVivek Goyal 8863c798398STejun Heo static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, 887629ed0b1STejun Heo int rw) 888629ed0b1STejun Heo { 8898a3d2615STejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 8908a3d2615STejun Heo struct tg_stats_cpu *stats_cpu; 891629ed0b1STejun Heo unsigned long flags; 892629ed0b1STejun Heo 893629ed0b1STejun Heo /* If per cpu stats are not allocated yet, don't do any accounting. */ 8948a3d2615STejun Heo if (tg->stats_cpu == NULL) 895629ed0b1STejun Heo return; 896629ed0b1STejun Heo 897629ed0b1STejun Heo /* 898629ed0b1STejun Heo * Disabling interrupts to provide mutual exclusion between two 899629ed0b1STejun Heo * writes on same cpu. It probably is not needed for 64bit. Not 900629ed0b1STejun Heo * optimizing that case yet. 
901629ed0b1STejun Heo */
902629ed0b1STejun Heo local_irq_save(flags);
903629ed0b1STejun Heo
9048a3d2615STejun Heo stats_cpu = this_cpu_ptr(tg->stats_cpu);
905629ed0b1STejun Heo
906629ed0b1STejun Heo blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
907629ed0b1STejun Heo blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
908629ed0b1STejun Heo
909629ed0b1STejun Heo local_irq_restore(flags);
910629ed0b1STejun Heo }
911629ed0b1STejun Heo
912e43473b7SVivek Goyal static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
913e43473b7SVivek Goyal {
914e43473b7SVivek Goyal bool rw = bio_data_dir(bio);
915e43473b7SVivek Goyal
916e43473b7SVivek Goyal /* Charge the bio to the group */
917e43473b7SVivek Goyal tg->bytes_disp[rw] += bio->bi_size;
9188e89d13fSVivek Goyal tg->io_disp[rw]++;
919e43473b7SVivek Goyal
9202a0f61e6STejun Heo /*
9212a0f61e6STejun Heo * REQ_THROTTLED is used to prevent the same bio from being throttled
9222a0f61e6STejun Heo * more than once, as a throttled bio will go through blk-throtl a
9232a0f61e6STejun Heo * second time when it eventually gets issued. Set it when a bio
9242a0f61e6STejun Heo * is being charged to a tg.
9252a0f61e6STejun Heo *
9262a0f61e6STejun Heo * Dispatch stats aren't recursive and each @bio should only be
9272a0f61e6STejun Heo * accounted by the @tg it was originally associated with. Let's
9282a0f61e6STejun Heo * update the stats when setting REQ_THROTTLED for the first time
9292a0f61e6STejun Heo * which is guaranteed to be for the @bio's original tg.
9302a0f61e6STejun Heo */
9312a0f61e6STejun Heo if (!(bio->bi_rw & REQ_THROTTLED)) {
9322a0f61e6STejun Heo bio->bi_rw |= REQ_THROTTLED;
9332a0f61e6STejun Heo throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
9342a0f61e6STejun Heo bio->bi_rw);
9352a0f61e6STejun Heo }
936e43473b7SVivek Goyal }
937e43473b7SVivek Goyal
938*c5cc2070STejun Heo /**
939*c5cc2070STejun Heo * throtl_add_bio_tg - add a bio to the specified throtl_grp
940*c5cc2070STejun Heo * @bio: bio to add
941*c5cc2070STejun Heo * @qn: qnode to use
942*c5cc2070STejun Heo * @tg: the target throtl_grp
943*c5cc2070STejun Heo *
944*c5cc2070STejun Heo * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
945*c5cc2070STejun Heo * tg->qnode_on_self[] is used.
946*c5cc2070STejun Heo */
947*c5cc2070STejun Heo static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
948*c5cc2070STejun Heo struct throtl_grp *tg)
949e43473b7SVivek Goyal {
95073f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue;
951e43473b7SVivek Goyal bool rw = bio_data_dir(bio);
952e43473b7SVivek Goyal
953*c5cc2070STejun Heo if (!qn)
954*c5cc2070STejun Heo qn = &tg->qnode_on_self[rw];
955*c5cc2070STejun Heo
9560e9f4164STejun Heo /*
9570e9f4164STejun Heo * If @tg doesn't currently have any bios queued in the same
9580e9f4164STejun Heo * direction, queueing @bio can change when @tg should be
9590e9f4164STejun Heo * dispatched. Mark that @tg was empty. This is automatically
9600e9f4164STejun Heo * cleared on the next tg_update_disptime().
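 * throtl_pending_timer_fn() checks this flag to decide whether @tg's
 * dispatch time needs to be re-evaluated before propagating dispatch to
 * the parent.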
9610e9f4164STejun Heo */ 9620e9f4164STejun Heo if (!sq->nr_queued[rw]) 9630e9f4164STejun Heo tg->flags |= THROTL_TG_WAS_EMPTY; 9640e9f4164STejun Heo 965*c5cc2070STejun Heo throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); 966*c5cc2070STejun Heo 96773f0d49aSTejun Heo sq->nr_queued[rw]++; 96877216b04STejun Heo throtl_enqueue_tg(tg); 969e43473b7SVivek Goyal } 970e43473b7SVivek Goyal 97177216b04STejun Heo static void tg_update_disptime(struct throtl_grp *tg) 972e43473b7SVivek Goyal { 97373f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 974e43473b7SVivek Goyal unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 975e43473b7SVivek Goyal struct bio *bio; 976e43473b7SVivek Goyal 977*c5cc2070STejun Heo if ((bio = throtl_peek_queued(&sq->queued[READ]))) 9780f3457f6STejun Heo tg_may_dispatch(tg, bio, &read_wait); 979e43473b7SVivek Goyal 980*c5cc2070STejun Heo if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) 9810f3457f6STejun Heo tg_may_dispatch(tg, bio, &write_wait); 982e43473b7SVivek Goyal 983e43473b7SVivek Goyal min_wait = min(read_wait, write_wait); 984e43473b7SVivek Goyal disptime = jiffies + min_wait; 985e43473b7SVivek Goyal 986e43473b7SVivek Goyal /* Update dispatch time */ 98777216b04STejun Heo throtl_dequeue_tg(tg); 988e43473b7SVivek Goyal tg->disptime = disptime; 98977216b04STejun Heo throtl_enqueue_tg(tg); 9900e9f4164STejun Heo 9910e9f4164STejun Heo /* see throtl_add_bio_tg() */ 9920e9f4164STejun Heo tg->flags &= ~THROTL_TG_WAS_EMPTY; 993e43473b7SVivek Goyal } 994e43473b7SVivek Goyal 99577216b04STejun Heo static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 996e43473b7SVivek Goyal { 99773f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 9986bc9c2b4STejun Heo struct throtl_service_queue *parent_sq = sq->parent_sq; 9996bc9c2b4STejun Heo struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 1000*c5cc2070STejun Heo struct throtl_grp *tg_to_put = NULL; 1001e43473b7SVivek Goyal struct bio *bio; 1002e43473b7SVivek Goyal 1003*c5cc2070STejun Heo /* 1004*c5cc2070STejun Heo * @bio is being transferred from @tg to @parent_sq. Popping a bio 1005*c5cc2070STejun Heo * from @tg may put its reference and @parent_sq might end up 1006*c5cc2070STejun Heo * getting released prematurely. Remember the tg to put and put it 1007*c5cc2070STejun Heo * after @bio is transferred to @parent_sq. 1008*c5cc2070STejun Heo */ 1009*c5cc2070STejun Heo bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); 101073f0d49aSTejun Heo sq->nr_queued[rw]--; 1011e43473b7SVivek Goyal 1012e43473b7SVivek Goyal throtl_charge_bio(tg, bio); 10136bc9c2b4STejun Heo 10146bc9c2b4STejun Heo /* 10156bc9c2b4STejun Heo * If our parent is another tg, we just need to transfer @bio to 10166bc9c2b4STejun Heo * the parent using throtl_add_bio_tg(). If our parent is 10176bc9c2b4STejun Heo * @td->service_queue, @bio is ready to be issued. Put it on its 10186bc9c2b4STejun Heo * bio_lists[] and decrease total number queued. The caller is 10196bc9c2b4STejun Heo * responsible for issuing these bios. 
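 * For the top-level service_queue, the queued bios are later issued by
 * blk_throtl_dispatch_work_fn().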
10206bc9c2b4STejun Heo */ 10216bc9c2b4STejun Heo if (parent_tg) { 1022*c5cc2070STejun Heo throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 10236bc9c2b4STejun Heo } else { 1024*c5cc2070STejun Heo throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 1025*c5cc2070STejun Heo &parent_sq->queued[rw]); 10266bc9c2b4STejun Heo BUG_ON(tg->td->nr_queued[rw] <= 0); 10276bc9c2b4STejun Heo tg->td->nr_queued[rw]--; 10286bc9c2b4STejun Heo } 1029e43473b7SVivek Goyal 10300f3457f6STejun Heo throtl_trim_slice(tg, rw); 10316bc9c2b4STejun Heo 1032*c5cc2070STejun Heo if (tg_to_put) 1033*c5cc2070STejun Heo blkg_put(tg_to_blkg(tg_to_put)); 1034e43473b7SVivek Goyal } 1035e43473b7SVivek Goyal 103677216b04STejun Heo static int throtl_dispatch_tg(struct throtl_grp *tg) 1037e43473b7SVivek Goyal { 103873f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 1039e43473b7SVivek Goyal unsigned int nr_reads = 0, nr_writes = 0; 1040e43473b7SVivek Goyal unsigned int max_nr_reads = throtl_grp_quantum*3/4; 1041c2f6805dSVivek Goyal unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 1042e43473b7SVivek Goyal struct bio *bio; 1043e43473b7SVivek Goyal 1044e43473b7SVivek Goyal /* Try to dispatch 75% READS and 25% WRITES */ 1045e43473b7SVivek Goyal 1046*c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[READ])) && 10470f3457f6STejun Heo tg_may_dispatch(tg, bio, NULL)) { 1048e43473b7SVivek Goyal 104977216b04STejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1050e43473b7SVivek Goyal nr_reads++; 1051e43473b7SVivek Goyal 1052e43473b7SVivek Goyal if (nr_reads >= max_nr_reads) 1053e43473b7SVivek Goyal break; 1054e43473b7SVivek Goyal } 1055e43473b7SVivek Goyal 1056*c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 10570f3457f6STejun Heo tg_may_dispatch(tg, bio, NULL)) { 1058e43473b7SVivek Goyal 105977216b04STejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1060e43473b7SVivek Goyal nr_writes++; 1061e43473b7SVivek Goyal 1062e43473b7SVivek Goyal if (nr_writes >= max_nr_writes) 1063e43473b7SVivek Goyal break; 1064e43473b7SVivek Goyal } 1065e43473b7SVivek Goyal 1066e43473b7SVivek Goyal return nr_reads + nr_writes; 1067e43473b7SVivek Goyal } 1068e43473b7SVivek Goyal 1069651930bcSTejun Heo static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 1070e43473b7SVivek Goyal { 1071e43473b7SVivek Goyal unsigned int nr_disp = 0; 1072e43473b7SVivek Goyal 1073e43473b7SVivek Goyal while (1) { 107473f0d49aSTejun Heo struct throtl_grp *tg = throtl_rb_first(parent_sq); 107573f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 1076e43473b7SVivek Goyal 1077e43473b7SVivek Goyal if (!tg) 1078e43473b7SVivek Goyal break; 1079e43473b7SVivek Goyal 1080e43473b7SVivek Goyal if (time_before(jiffies, tg->disptime)) 1081e43473b7SVivek Goyal break; 1082e43473b7SVivek Goyal 108377216b04STejun Heo throtl_dequeue_tg(tg); 1084e43473b7SVivek Goyal 108577216b04STejun Heo nr_disp += throtl_dispatch_tg(tg); 1086e43473b7SVivek Goyal 108773f0d49aSTejun Heo if (sq->nr_queued[0] || sq->nr_queued[1]) 108877216b04STejun Heo tg_update_disptime(tg); 1089e43473b7SVivek Goyal 1090e43473b7SVivek Goyal if (nr_disp >= throtl_quantum) 1091e43473b7SVivek Goyal break; 1092e43473b7SVivek Goyal } 1093e43473b7SVivek Goyal 1094e43473b7SVivek Goyal return nr_disp; 1095e43473b7SVivek Goyal } 1096e43473b7SVivek Goyal 10976e1a5704STejun Heo /** 10986e1a5704STejun Heo * throtl_pending_timer_fn - timer function for service_queue->pending_timer 10996e1a5704STejun Heo * @arg: the 
throtl_service_queue being serviced
11006e1a5704STejun Heo *
11016e1a5704STejun Heo * This timer is armed when a child throtl_grp with active bios becomes
11026e1a5704STejun Heo * pending and is queued on the service_queue's pending_tree. It expires when
11036e1a5704STejun Heo * the first child throtl_grp should be dispatched. This function
11042e48a530STejun Heo * dispatches bios from the children throtl_grps to the parent
11052e48a530STejun Heo * service_queue.
11062e48a530STejun Heo *
11072e48a530STejun Heo * If the parent's parent is another throtl_grp, dispatching is propagated
11082e48a530STejun Heo * by either arming its pending_timer or repeating dispatch directly. If
11092e48a530STejun Heo * the top-level service_tree is reached, throtl_data->dispatch_work is
11102e48a530STejun Heo * kicked so that the ready bios are issued.
11116e1a5704STejun Heo */
111269df0ab0STejun Heo static void throtl_pending_timer_fn(unsigned long arg)
111369df0ab0STejun Heo {
111469df0ab0STejun Heo struct throtl_service_queue *sq = (void *)arg;
11152e48a530STejun Heo struct throtl_grp *tg = sq_to_tg(sq);
111669df0ab0STejun Heo struct throtl_data *td = sq_to_td(sq);
1117cb76199cSTejun Heo struct request_queue *q = td->queue;
11182e48a530STejun Heo struct throtl_service_queue *parent_sq;
11192e48a530STejun Heo bool dispatched;
11206e1a5704STejun Heo int ret;
1121e43473b7SVivek Goyal
1122e43473b7SVivek Goyal spin_lock_irq(q->queue_lock);
11232e48a530STejun Heo again:
11242e48a530STejun Heo parent_sq = sq->parent_sq;
11252e48a530STejun Heo dispatched = false;
1126e43473b7SVivek Goyal
11277f52f98cSTejun Heo while (true) {
1128fda6f272STejun Heo throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
11292e48a530STejun Heo sq->nr_queued[READ] + sq->nr_queued[WRITE],
11302e48a530STejun Heo sq->nr_queued[READ], sq->nr_queued[WRITE]);
1131e43473b7SVivek Goyal
11327f52f98cSTejun Heo ret = throtl_select_dispatch(sq);
11337f52f98cSTejun Heo if (ret) {
11347f52f98cSTejun Heo throtl_log(sq, "bios disp=%u", ret);
11357f52f98cSTejun Heo dispatched = true;
1136651930bcSTejun Heo }
1137e43473b7SVivek Goyal
11387f52f98cSTejun Heo if (throtl_schedule_next_dispatch(sq, false))
11397f52f98cSTejun Heo break;
11407f52f98cSTejun Heo
11417f52f98cSTejun Heo /* this dispatch window is still open, relax and repeat */
11427f52f98cSTejun Heo spin_unlock_irq(q->queue_lock);
11437f52f98cSTejun Heo cpu_relax();
11447f52f98cSTejun Heo spin_lock_irq(q->queue_lock);
11457f52f98cSTejun Heo }
11466a525600STejun Heo
11472e48a530STejun Heo if (!dispatched)
11482e48a530STejun Heo goto out_unlock;
11496e1a5704STejun Heo
11502e48a530STejun Heo if (parent_sq) {
11512e48a530STejun Heo /* @parent_sq is another throtl_grp, propagate dispatch */
11522e48a530STejun Heo if (tg->flags & THROTL_TG_WAS_EMPTY) {
11532e48a530STejun Heo tg_update_disptime(tg);
11542e48a530STejun Heo if (!throtl_schedule_next_dispatch(parent_sq, false)) {
11552e48a530STejun Heo /* window is already open, repeat dispatching */
11562e48a530STejun Heo sq = parent_sq;
11572e48a530STejun Heo tg = sq_to_tg(sq);
11582e48a530STejun Heo goto again;
11592e48a530STejun Heo }
11602e48a530STejun Heo }
11612e48a530STejun Heo } else {
11622e48a530STejun Heo /* reached the top-level, queue issuing */
11632e48a530STejun Heo queue_work(kthrotld_workqueue, &td->dispatch_work);
11642e48a530STejun Heo }
11652e48a530STejun Heo out_unlock:
11666e1a5704STejun Heo spin_unlock_irq(q->queue_lock);
11676e1a5704STejun Heo }
11686e1a5704STejun Heo
11696e1a5704STejun Heo /**
11706e1a5704STejun Heo *
blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 11716e1a5704STejun Heo * @work: work item being executed 11726e1a5704STejun Heo * 11736e1a5704STejun Heo * This function is queued for execution when bio's reach the queued[] 11746e1a5704STejun Heo * lists of throtl_data->service_queue. Those bio's are ready and issued 11756e1a5704STejun Heo * by this function. 11766e1a5704STejun Heo */ 11776e1a5704STejun Heo void blk_throtl_dispatch_work_fn(struct work_struct *work) 11786e1a5704STejun Heo { 11796e1a5704STejun Heo struct throtl_data *td = container_of(work, struct throtl_data, 11806e1a5704STejun Heo dispatch_work); 11816e1a5704STejun Heo struct throtl_service_queue *td_sq = &td->service_queue; 11826e1a5704STejun Heo struct request_queue *q = td->queue; 11836e1a5704STejun Heo struct bio_list bio_list_on_stack; 11846e1a5704STejun Heo struct bio *bio; 11856e1a5704STejun Heo struct blk_plug plug; 11866e1a5704STejun Heo int rw; 11876e1a5704STejun Heo 11886e1a5704STejun Heo bio_list_init(&bio_list_on_stack); 11896e1a5704STejun Heo 11906e1a5704STejun Heo spin_lock_irq(q->queue_lock); 1191*c5cc2070STejun Heo for (rw = READ; rw <= WRITE; rw++) 1192*c5cc2070STejun Heo while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1193*c5cc2070STejun Heo bio_list_add(&bio_list_on_stack, bio); 1194e43473b7SVivek Goyal spin_unlock_irq(q->queue_lock); 1195e43473b7SVivek Goyal 11966e1a5704STejun Heo if (!bio_list_empty(&bio_list_on_stack)) { 119769d60eb9SVivek Goyal blk_start_plug(&plug); 1198e43473b7SVivek Goyal while ((bio = bio_list_pop(&bio_list_on_stack))) 1199e43473b7SVivek Goyal generic_make_request(bio); 120069d60eb9SVivek Goyal blk_finish_plug(&plug); 1201e43473b7SVivek Goyal } 1202e43473b7SVivek Goyal } 1203e43473b7SVivek Goyal 1204f95a04afSTejun Heo static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 1205f95a04afSTejun Heo struct blkg_policy_data *pd, int off) 120641b38b6dSTejun Heo { 1207f95a04afSTejun Heo struct throtl_grp *tg = pd_to_tg(pd); 120841b38b6dSTejun Heo struct blkg_rwstat rwstat = { }, tmp; 120941b38b6dSTejun Heo int i, cpu; 121041b38b6dSTejun Heo 121141b38b6dSTejun Heo for_each_possible_cpu(cpu) { 12128a3d2615STejun Heo struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 121341b38b6dSTejun Heo 121441b38b6dSTejun Heo tmp = blkg_rwstat_read((void *)sc + off); 121541b38b6dSTejun Heo for (i = 0; i < BLKG_RWSTAT_NR; i++) 121641b38b6dSTejun Heo rwstat.cnt[i] += tmp.cnt[i]; 121741b38b6dSTejun Heo } 121841b38b6dSTejun Heo 1219f95a04afSTejun Heo return __blkg_prfill_rwstat(sf, pd, &rwstat); 122041b38b6dSTejun Heo } 122141b38b6dSTejun Heo 12228a3d2615STejun Heo static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, 122341b38b6dSTejun Heo struct seq_file *sf) 122441b38b6dSTejun Heo { 12253c798398STejun Heo struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 122641b38b6dSTejun Heo 12273c798398STejun Heo blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, 12285bc4afb1STejun Heo cft->private, true); 122941b38b6dSTejun Heo return 0; 123041b38b6dSTejun Heo } 123141b38b6dSTejun Heo 1232f95a04afSTejun Heo static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1233f95a04afSTejun Heo int off) 123460c2bc2dSTejun Heo { 1235f95a04afSTejun Heo struct throtl_grp *tg = pd_to_tg(pd); 1236f95a04afSTejun Heo u64 v = *(u64 *)((void *)tg + off); 123760c2bc2dSTejun Heo 1238af133cebSTejun Heo if (v == -1) 123960c2bc2dSTejun Heo return 0; 1240f95a04afSTejun Heo return __blkg_prfill_u64(sf, pd, v); 124160c2bc2dSTejun Heo }
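/*
 * tg_prfill_conf_u64() above (and the _uint and tg_set_conf() helpers that
 * follow) share one body per type by carrying the field offset in
 * cftype->private and applying it with pointer arithmetic.  Below is a
 * minimal, self-contained userspace sketch of the same offsetof() pattern;
 * struct limits, limit_read_u64() and the sample values are illustrative
 * only and are not part of the kernel API.  As in the kernel code, an
 * all-ones value stands for "no limit configured".
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct limits {
	uint64_t bps[2];		/* [0] = READ, [1] = WRITE */
	unsigned int iops[2];
};

/* generic accessor: same offset trick as tg_prfill_conf_u64() */
static uint64_t limit_read_u64(const struct limits *l, size_t off)
{
	return *(const uint64_t *)((const char *)l + off);
}

int main(void)
{
	struct limits l = { .bps = { 1048576, UINT64_MAX }, .iops = { 100, 0 } };

	/* one helper serves any u64 field, selected purely by its offset */
	printf("read bps limit: %llu\n", (unsigned long long)
	       limit_read_u64(&l, offsetof(struct limits, bps[0])));
	return 0;
}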
124260c2bc2dSTejun Heo 1243f95a04afSTejun Heo static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1244f95a04afSTejun Heo int off) 1245af133cebSTejun Heo { 1246f95a04afSTejun Heo struct throtl_grp *tg = pd_to_tg(pd); 1247f95a04afSTejun Heo unsigned int v = *(unsigned int *)((void *)tg + off); 1248af133cebSTejun Heo 1249af133cebSTejun Heo if (v == -1) 1250af133cebSTejun Heo return 0; 1251f95a04afSTejun Heo return __blkg_prfill_u64(sf, pd, v); 1252af133cebSTejun Heo } 1253af133cebSTejun Heo 1254af133cebSTejun Heo static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, 125560c2bc2dSTejun Heo struct seq_file *sf) 125660c2bc2dSTejun Heo { 12573c798398STejun Heo blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, 12583c798398STejun Heo &blkcg_policy_throtl, cft->private, false); 125960c2bc2dSTejun Heo return 0; 126060c2bc2dSTejun Heo } 126160c2bc2dSTejun Heo 1262af133cebSTejun Heo static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, 1263af133cebSTejun Heo struct seq_file *sf) 1264e43473b7SVivek Goyal { 12653c798398STejun Heo blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, 12663c798398STejun Heo &blkcg_policy_throtl, cft->private, false); 1267af133cebSTejun Heo return 0; 1268e43473b7SVivek Goyal } 1269e43473b7SVivek Goyal 1270af133cebSTejun Heo static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, 1271af133cebSTejun Heo bool is_u64) 127260c2bc2dSTejun Heo { 12733c798398STejun Heo struct blkcg *blkcg = cgroup_to_blkcg(cgrp); 127460c2bc2dSTejun Heo struct blkg_conf_ctx ctx; 1275af133cebSTejun Heo struct throtl_grp *tg; 127669df0ab0STejun Heo struct throtl_service_queue *sq; 127760c2bc2dSTejun Heo int ret; 127860c2bc2dSTejun Heo 12793c798398STejun Heo ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 128060c2bc2dSTejun Heo if (ret) 128160c2bc2dSTejun Heo return ret; 128260c2bc2dSTejun Heo 1283af133cebSTejun Heo tg = blkg_to_tg(ctx.blkg); 128469df0ab0STejun Heo sq = &tg->service_queue; 1285af133cebSTejun Heo 1286af133cebSTejun Heo if (!ctx.v) 1287af133cebSTejun Heo ctx.v = -1; 1288af133cebSTejun Heo 1289af133cebSTejun Heo if (is_u64) 1290af133cebSTejun Heo *(u64 *)((void *)tg + cft->private) = ctx.v; 1291af133cebSTejun Heo else 1292af133cebSTejun Heo *(unsigned int *)((void *)tg + cft->private) = ctx.v; 1293af133cebSTejun Heo 1294fda6f272STejun Heo throtl_log(&tg->service_queue, 1295fda6f272STejun Heo "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1296632b4493STejun Heo tg->bps[READ], tg->bps[WRITE], 1297632b4493STejun Heo tg->iops[READ], tg->iops[WRITE]); 1298632b4493STejun Heo 1299632b4493STejun Heo /* 1300632b4493STejun Heo * We're already holding queue_lock and know @tg is valid. Let's 1301632b4493STejun Heo * apply the new config directly. 1302632b4493STejun Heo * 1303632b4493STejun Heo * Restart the slices for both READ and WRITE. It might happen 1304632b4493STejun Heo * that a group's limits are dropped suddenly and we don't want to 1305632b4493STejun Heo * account recently dispatched IO at the new low rate.
1306632b4493STejun Heo */ 13070f3457f6STejun Heo throtl_start_new_slice(tg, 0); 13080f3457f6STejun Heo throtl_start_new_slice(tg, 1); 1309632b4493STejun Heo 13105b2c16aaSTejun Heo if (tg->flags & THROTL_TG_PENDING) { 131177216b04STejun Heo tg_update_disptime(tg); 13127f52f98cSTejun Heo throtl_schedule_next_dispatch(sq->parent_sq, true); 1313632b4493STejun Heo } 1314af133cebSTejun Heo 131560c2bc2dSTejun Heo blkg_conf_finish(&ctx); 1316a2b1693bSTejun Heo return 0; 131760c2bc2dSTejun Heo } 131860c2bc2dSTejun Heo 1319af133cebSTejun Heo static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, 132060c2bc2dSTejun Heo const char *buf) 132160c2bc2dSTejun Heo { 1322af133cebSTejun Heo return tg_set_conf(cgrp, cft, buf, true); 132360c2bc2dSTejun Heo } 132460c2bc2dSTejun Heo 1325af133cebSTejun Heo static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, 132660c2bc2dSTejun Heo const char *buf) 132760c2bc2dSTejun Heo { 1328af133cebSTejun Heo return tg_set_conf(cgrp, cft, buf, false); 132960c2bc2dSTejun Heo } 133060c2bc2dSTejun Heo 133160c2bc2dSTejun Heo static struct cftype throtl_files[] = { 133260c2bc2dSTejun Heo { 133360c2bc2dSTejun Heo .name = "throttle.read_bps_device", 1334af133cebSTejun Heo .private = offsetof(struct throtl_grp, bps[READ]), 1335af133cebSTejun Heo .read_seq_string = tg_print_conf_u64, 1336af133cebSTejun Heo .write_string = tg_set_conf_u64, 133760c2bc2dSTejun Heo .max_write_len = 256, 133860c2bc2dSTejun Heo }, 133960c2bc2dSTejun Heo { 134060c2bc2dSTejun Heo .name = "throttle.write_bps_device", 1341af133cebSTejun Heo .private = offsetof(struct throtl_grp, bps[WRITE]), 1342af133cebSTejun Heo .read_seq_string = tg_print_conf_u64, 1343af133cebSTejun Heo .write_string = tg_set_conf_u64, 134460c2bc2dSTejun Heo .max_write_len = 256, 134560c2bc2dSTejun Heo }, 134660c2bc2dSTejun Heo { 134760c2bc2dSTejun Heo .name = "throttle.read_iops_device", 1348af133cebSTejun Heo .private = offsetof(struct throtl_grp, iops[READ]), 1349af133cebSTejun Heo .read_seq_string = tg_print_conf_uint, 1350af133cebSTejun Heo .write_string = tg_set_conf_uint, 135160c2bc2dSTejun Heo .max_write_len = 256, 135260c2bc2dSTejun Heo }, 135360c2bc2dSTejun Heo { 135460c2bc2dSTejun Heo .name = "throttle.write_iops_device", 1355af133cebSTejun Heo .private = offsetof(struct throtl_grp, iops[WRITE]), 1356af133cebSTejun Heo .read_seq_string = tg_print_conf_uint, 1357af133cebSTejun Heo .write_string = tg_set_conf_uint, 135860c2bc2dSTejun Heo .max_write_len = 256, 135960c2bc2dSTejun Heo }, 136060c2bc2dSTejun Heo { 136160c2bc2dSTejun Heo .name = "throttle.io_service_bytes", 13625bc4afb1STejun Heo .private = offsetof(struct tg_stats_cpu, service_bytes), 13638a3d2615STejun Heo .read_seq_string = tg_print_cpu_rwstat, 136460c2bc2dSTejun Heo }, 136560c2bc2dSTejun Heo { 136660c2bc2dSTejun Heo .name = "throttle.io_serviced", 13675bc4afb1STejun Heo .private = offsetof(struct tg_stats_cpu, serviced), 13688a3d2615STejun Heo .read_seq_string = tg_print_cpu_rwstat, 136960c2bc2dSTejun Heo }, 137060c2bc2dSTejun Heo { } /* terminate */ 137160c2bc2dSTejun Heo }; 137260c2bc2dSTejun Heo 1373da527770SVivek Goyal static void throtl_shutdown_wq(struct request_queue *q) 1374e43473b7SVivek Goyal { 1375e43473b7SVivek Goyal struct throtl_data *td = q->td; 1376e43473b7SVivek Goyal 137769df0ab0STejun Heo cancel_work_sync(&td->dispatch_work); 1378e43473b7SVivek Goyal } 1379e43473b7SVivek Goyal 13803c798398STejun Heo static struct blkcg_policy blkcg_policy_throtl = { 1381f9fcc2d3STejun Heo .pd_size = sizeof(struct throtl_grp), 
1382f9fcc2d3STejun Heo .cftypes = throtl_files, 1383f9fcc2d3STejun Heo 13843c798398STejun Heo .pd_init_fn = throtl_pd_init, 13853c798398STejun Heo .pd_exit_fn = throtl_pd_exit, 13863c798398STejun Heo .pd_reset_stats_fn = throtl_pd_reset_stats, 1387e43473b7SVivek Goyal }; 1388e43473b7SVivek Goyal 1389bc16a4f9STejun Heo bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1390e43473b7SVivek Goyal { 1391e43473b7SVivek Goyal struct throtl_data *td = q->td; 1392*c5cc2070STejun Heo struct throtl_qnode *qn = NULL; 1393e43473b7SVivek Goyal struct throtl_grp *tg; 139473f0d49aSTejun Heo struct throtl_service_queue *sq; 13950e9f4164STejun Heo bool rw = bio_data_dir(bio); 13963c798398STejun Heo struct blkcg *blkcg; 1397bc16a4f9STejun Heo bool throttled = false; 1398e43473b7SVivek Goyal 13992a0f61e6STejun Heo /* see throtl_charge_bio() */ 14002a0f61e6STejun Heo if (bio->bi_rw & REQ_THROTTLED) 1401bc16a4f9STejun Heo goto out; 1402e43473b7SVivek Goyal 1403af75cd3cSVivek Goyal /* 1404af75cd3cSVivek Goyal * A throtl_grp pointer retrieved under rcu can be used to access 1405af75cd3cSVivek Goyal * basic fields like stats and io rates. If a group has no rules, 1406af75cd3cSVivek Goyal * just update the dispatch stats in a lockless manner and return. 1407af75cd3cSVivek Goyal */ 1408af75cd3cSVivek Goyal rcu_read_lock(); 14093c798398STejun Heo blkcg = bio_blkcg(bio); 1410cd1604faSTejun Heo tg = throtl_lookup_tg(td, blkcg); 1411af75cd3cSVivek Goyal if (tg) { 1412af75cd3cSVivek Goyal if (tg_no_rule_group(tg, rw)) { 1413629ed0b1STejun Heo throtl_update_dispatch_stats(tg_to_blkg(tg), 1414629ed0b1STejun Heo bio->bi_size, bio->bi_rw); 14152a7f1244STejun Heo goto out_unlock_rcu; 1416af75cd3cSVivek Goyal } 1417af75cd3cSVivek Goyal } 1418af75cd3cSVivek Goyal 1419af75cd3cSVivek Goyal /* 1420af75cd3cSVivek Goyal * Either the group has not been allocated yet or it is not an 1421af75cd3cSVivek Goyal * unlimited IO group 1422af75cd3cSVivek Goyal */ 1423e43473b7SVivek Goyal spin_lock_irq(q->queue_lock); 1424cd1604faSTejun Heo tg = throtl_lookup_create_tg(td, blkcg); 1425bc16a4f9STejun Heo if (unlikely(!tg)) 1426bc16a4f9STejun Heo goto out_unlock; 1427f469a7b4SVivek Goyal 142873f0d49aSTejun Heo sq = &tg->service_queue; 142973f0d49aSTejun Heo 14309e660acfSTejun Heo while (true) { 14319e660acfSTejun Heo /* throtl is FIFO - if bios are already queued, should queue */ 14320e9f4164STejun Heo if (sq->nr_queued[rw]) 14339e660acfSTejun Heo break; 1434de701c74SVivek Goyal 14359e660acfSTejun Heo /* if above limits, break to queue */ 14369e660acfSTejun Heo if (!tg_may_dispatch(tg, bio, NULL)) 14379e660acfSTejun Heo break; 14389e660acfSTejun Heo 14399e660acfSTejun Heo /* within limits, let's charge and dispatch directly */ 1440e43473b7SVivek Goyal throtl_charge_bio(tg, bio); 144104521db0SVivek Goyal 144204521db0SVivek Goyal /* 144304521db0SVivek Goyal * We need to trim the slice even when bios are not being queued, 144404521db0SVivek Goyal * otherwise it might happen that a bio is not queued for 144504521db0SVivek Goyal * a long time and the slice keeps on extending and trim is not 144604521db0SVivek Goyal * called for a long time. Now if limits are reduced suddenly 144704521db0SVivek Goyal * we take into account all the IO dispatched so far at the new 144804521db0SVivek Goyal * low rate and newly queued IO gets a really long dispatch 144904521db0SVivek Goyal * time. 145004521db0SVivek Goyal * 145104521db0SVivek Goyal * So keep on trimming the slice even if the bio is not queued.
145204521db0SVivek Goyal */ 14530f3457f6STejun Heo throtl_trim_slice(tg, rw); 14549e660acfSTejun Heo 14559e660acfSTejun Heo /* 14569e660acfSTejun Heo * @bio passed through this layer without being throttled. 14579e660acfSTejun Heo * Climb up the ladder. If we're already at the top, it 14589e660acfSTejun Heo * can be executed directly. 14599e660acfSTejun Heo */ 1460*c5cc2070STejun Heo qn = &tg->qnode_on_parent[rw]; 14619e660acfSTejun Heo sq = sq->parent_sq; 14629e660acfSTejun Heo tg = sq_to_tg(sq); 14639e660acfSTejun Heo if (!tg) 1464bc16a4f9STejun Heo goto out_unlock; 1465e43473b7SVivek Goyal } 1466e43473b7SVivek Goyal 14679e660acfSTejun Heo /* out-of-limit, queue to @tg */ 1468fda6f272STejun Heo throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 14698e89d13fSVivek Goyal rw == READ ? 'R' : 'W', 1470e43473b7SVivek Goyal tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], 14718e89d13fSVivek Goyal tg->io_disp[rw], tg->iops[rw], 147273f0d49aSTejun Heo sq->nr_queued[READ], sq->nr_queued[WRITE]); 1473e43473b7SVivek Goyal 1474671058fbSTejun Heo bio_associate_current(bio); 14756bc9c2b4STejun Heo tg->td->nr_queued[rw]++; 1476*c5cc2070STejun Heo throtl_add_bio_tg(bio, qn, tg); 1477bc16a4f9STejun Heo throttled = true; 1478e43473b7SVivek Goyal 14797f52f98cSTejun Heo /* 14807f52f98cSTejun Heo * Update @tg's dispatch time and force schedule dispatch if @tg 14817f52f98cSTejun Heo * was empty before @bio. The forced scheduling isn't likely to 14827f52f98cSTejun Heo * cause undue delay as @bio is likely to be dispatched directly if 14837f52f98cSTejun Heo * its @tg's disptime is not in the future. 14847f52f98cSTejun Heo */ 14850e9f4164STejun Heo if (tg->flags & THROTL_TG_WAS_EMPTY) { 148677216b04STejun Heo tg_update_disptime(tg); 14877f52f98cSTejun Heo throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); 1488e43473b7SVivek Goyal } 1489e43473b7SVivek Goyal 1490bc16a4f9STejun Heo out_unlock: 1491e43473b7SVivek Goyal spin_unlock_irq(q->queue_lock); 14922a7f1244STejun Heo out_unlock_rcu: 14932a7f1244STejun Heo rcu_read_unlock(); 1494bc16a4f9STejun Heo out: 14952a0f61e6STejun Heo /* 14962a0f61e6STejun Heo * As multiple blk-throtls may stack in the same issue path, we 14972a0f61e6STejun Heo * don't want bios to leave with the flag set. Clear the flag if 14982a0f61e6STejun Heo * being issued. 14992a0f61e6STejun Heo */ 15002a0f61e6STejun Heo if (!throttled) 15012a0f61e6STejun Heo bio->bi_rw &= ~REQ_THROTTLED; 1502bc16a4f9STejun Heo return throttled; 1503e43473b7SVivek Goyal } 1504e43473b7SVivek Goyal 15052a12f0dcSTejun Heo /* 15062a12f0dcSTejun Heo * Dispatch all bios from all children tg's queued on @parent_sq. On 15072a12f0dcSTejun Heo * return, @parent_sq is guaranteed to not have any active children tg's 15082a12f0dcSTejun Heo * and all bios from previously active tg's are on @parent_sq->queued[].
15092a12f0dcSTejun Heo */ 15102a12f0dcSTejun Heo static void tg_drain_bios(struct throtl_service_queue *parent_sq) 15112a12f0dcSTejun Heo { 15122a12f0dcSTejun Heo struct throtl_grp *tg; 15132a12f0dcSTejun Heo 15142a12f0dcSTejun Heo while ((tg = throtl_rb_first(parent_sq))) { 15152a12f0dcSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 15162a12f0dcSTejun Heo struct bio *bio; 15172a12f0dcSTejun Heo 15182a12f0dcSTejun Heo throtl_dequeue_tg(tg); 15192a12f0dcSTejun Heo 1520*c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[READ]))) 15212a12f0dcSTejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1522*c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) 15232a12f0dcSTejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 15242a12f0dcSTejun Heo } 15252a12f0dcSTejun Heo } 15262a12f0dcSTejun Heo 1527c9a929ddSTejun Heo /** 1528c9a929ddSTejun Heo * blk_throtl_drain - drain throttled bios 1529c9a929ddSTejun Heo * @q: request_queue to drain throttled bios for 1530c9a929ddSTejun Heo * 1531c9a929ddSTejun Heo * Dispatch all currently throttled bios on @q through ->make_request_fn(). 1532c9a929ddSTejun Heo */ 1533c9a929ddSTejun Heo void blk_throtl_drain(struct request_queue *q) 1534c9a929ddSTejun Heo __releases(q->queue_lock) __acquires(q->queue_lock) 1535c9a929ddSTejun Heo { 1536c9a929ddSTejun Heo struct throtl_data *td = q->td; 15372a12f0dcSTejun Heo struct blkcg_gq *blkg; 15382a12f0dcSTejun Heo struct cgroup *pos_cgrp; 1539c9a929ddSTejun Heo struct bio *bio; 1540651930bcSTejun Heo int rw; 1541c9a929ddSTejun Heo 15428bcb6c7dSAndi Kleen queue_lockdep_assert_held(q); 15432a12f0dcSTejun Heo rcu_read_lock(); 1544c9a929ddSTejun Heo 15452a12f0dcSTejun Heo /* 15462a12f0dcSTejun Heo * Drain each tg while doing a post-order walk on the blkg tree, so 15472a12f0dcSTejun Heo * that all bios are propagated to td->service_queue. It'd be 15482a12f0dcSTejun Heo * better to walk the service_queue tree directly but the blkg walk 15492a12f0dcSTejun Heo * is easier.
15502a12f0dcSTejun Heo */ 15512a12f0dcSTejun Heo blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) 15522a12f0dcSTejun Heo tg_drain_bios(&blkg_to_tg(blkg)->service_queue); 155373f0d49aSTejun Heo 15542a12f0dcSTejun Heo tg_drain_bios(&td_root_tg(td)->service_queue); 1555c9a929ddSTejun Heo 15562a12f0dcSTejun Heo /* finally, transfer bios from top-level tg's into the td */ 15572a12f0dcSTejun Heo tg_drain_bios(&td->service_queue); 15582a12f0dcSTejun Heo 15592a12f0dcSTejun Heo rcu_read_unlock(); 1560c9a929ddSTejun Heo spin_unlock_irq(q->queue_lock); 1561c9a929ddSTejun Heo 15622a12f0dcSTejun Heo /* all bios now should be in td->service_queue, issue them */ 1563651930bcSTejun Heo for (rw = READ; rw <= WRITE; rw++) 1564*c5cc2070STejun Heo while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], 1565*c5cc2070STejun Heo NULL))) 1566c9a929ddSTejun Heo generic_make_request(bio); 1567c9a929ddSTejun Heo 1568c9a929ddSTejun Heo spin_lock_irq(q->queue_lock); 1569c9a929ddSTejun Heo } 1570c9a929ddSTejun Heo 1571e43473b7SVivek Goyal int blk_throtl_init(struct request_queue *q) 1572e43473b7SVivek Goyal { 1573e43473b7SVivek Goyal struct throtl_data *td; 1574a2b1693bSTejun Heo int ret; 1575e43473b7SVivek Goyal 1576e43473b7SVivek Goyal td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1577e43473b7SVivek Goyal if (!td) 1578e43473b7SVivek Goyal return -ENOMEM; 1579e43473b7SVivek Goyal 158069df0ab0STejun Heo INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 158177216b04STejun Heo throtl_service_queue_init(&td->service_queue, NULL); 1582e43473b7SVivek Goyal 1583cd1604faSTejun Heo q->td = td; 158429b12589SVivek Goyal td->queue = q; 158502977e4aSVivek Goyal 1586a2b1693bSTejun Heo /* activate policy */ 15873c798398STejun Heo ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 1588a2b1693bSTejun Heo if (ret) 158929b12589SVivek Goyal kfree(td); 1590a2b1693bSTejun Heo return ret; 1591e43473b7SVivek Goyal } 1592e43473b7SVivek Goyal 1593e43473b7SVivek Goyal void blk_throtl_exit(struct request_queue *q) 1594e43473b7SVivek Goyal { 1595c875f4d0STejun Heo BUG_ON(!q->td); 1596da527770SVivek Goyal throtl_shutdown_wq(q); 15973c798398STejun Heo blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1598c9a929ddSTejun Heo kfree(q->td); 1599e43473b7SVivek Goyal } 1600e43473b7SVivek Goyal 1601e43473b7SVivek Goyal static int __init throtl_init(void) 1602e43473b7SVivek Goyal { 1603450adcbeSVivek Goyal kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1604450adcbeSVivek Goyal if (!kthrotld_workqueue) 1605450adcbeSVivek Goyal panic("Failed to create kthrotld\n"); 1606450adcbeSVivek Goyal 16073c798398STejun Heo return blkcg_policy_register(&blkcg_policy_throtl); 1608e43473b7SVivek Goyal } 1609e43473b7SVivek Goyal 1610e43473b7SVivek Goyal module_init(throtl_init); 1611
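/*
 * The queued[] lists drained above by blk_throtl_dispatch_work_fn() and
 * blk_throtl_drain() hold per-source qnodes that throtl_pop_queued() walks
 * in round-robin order, so one busy child cannot starve its siblings.
 * Below is a minimal, self-contained userspace sketch of that round-robin
 * draining; the array-based queues and all names are illustrative only,
 * not the kernel data structures.
 */
#include <stdio.h>

#define NR_SRC	3	/* number of bio sources (children + local group) */
#define DEPTH	4	/* max bios queued per source in this sketch */

int main(void)
{
	/* queue[i][j]: id of the j-th bio queued by source i, 0 = empty */
	int queue[NR_SRC][DEPTH] = {
		{ 11, 12, 13, 14 },	/* a source with many queued bios */
		{ 21,  0,  0,  0 },
		{ 31, 32,  0,  0 },
	};
	int head[NR_SRC] = { 0 };
	int remaining = 7;		/* total bios queued above */

	/* pop one bio per source per pass, like round-robin qnode popping */
	while (remaining) {
		for (int i = 0; i < NR_SRC; i++) {
			if (head[i] < DEPTH && queue[i][head[i]]) {
				printf("dispatch bio %d from source %d\n",
				       queue[i][head[i]++], i);
				remaining--;
			}
		}
	}
	return 0;
}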