1e43473b7SVivek Goyal /* 2e43473b7SVivek Goyal * Interface for controlling IO bandwidth on a request queue 3e43473b7SVivek Goyal * 4e43473b7SVivek Goyal * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 5e43473b7SVivek Goyal */ 6e43473b7SVivek Goyal 7e43473b7SVivek Goyal #include <linux/module.h> 8e43473b7SVivek Goyal #include <linux/slab.h> 9e43473b7SVivek Goyal #include <linux/blkdev.h> 10e43473b7SVivek Goyal #include <linux/bio.h> 11e43473b7SVivek Goyal #include <linux/blktrace_api.h> 12e43473b7SVivek Goyal #include "blk-cgroup.h" 13bc9fcbf9STejun Heo #include "blk.h" 14e43473b7SVivek Goyal 15e43473b7SVivek Goyal /* Max dispatch from a group in 1 round */ 16e43473b7SVivek Goyal static int throtl_grp_quantum = 8; 17e43473b7SVivek Goyal 18e43473b7SVivek Goyal /* Total max dispatch from all groups in one round */ 19e43473b7SVivek Goyal static int throtl_quantum = 32; 20e43473b7SVivek Goyal 21e43473b7SVivek Goyal /* Throttling is performed over 100ms slice and after that slice is renewed */ 22e43473b7SVivek Goyal static unsigned long throtl_slice = HZ/10; /* 100 ms */ 23e43473b7SVivek Goyal 243c798398STejun Heo static struct blkcg_policy blkcg_policy_throtl; 250381411eSTejun Heo 26450adcbeSVivek Goyal /* A workqueue to queue throttle related work */ 27450adcbeSVivek Goyal static struct workqueue_struct *kthrotld_workqueue; 28450adcbeSVivek Goyal 29c5cc2070STejun Heo /* 30c5cc2070STejun Heo * To implement hierarchical throttling, throtl_grps form a tree and bios 31c5cc2070STejun Heo * are dispatched upwards level by level until they reach the top and get 32c5cc2070STejun Heo * issued. When dispatching bios from the children and local group at each 33c5cc2070STejun Heo * level, if the bios are dispatched into a single bio_list, there's a risk 34c5cc2070STejun Heo * of a local or child group which can queue many bios at once filling up 35c5cc2070STejun Heo * the list starving others. 36c5cc2070STejun Heo * 37c5cc2070STejun Heo * To avoid such starvation, dispatched bios are queued separately 38c5cc2070STejun Heo * according to where they came from. When they are again dispatched to 39c5cc2070STejun Heo * the parent, they're popped in round-robin order so that no single source 40c5cc2070STejun Heo * hogs the dispatch window. 41c5cc2070STejun Heo * 42c5cc2070STejun Heo * throtl_qnode is used to keep the queued bios separated by their sources. 43c5cc2070STejun Heo * Bios are queued to throtl_qnode which in turn is queued to 44c5cc2070STejun Heo * throtl_service_queue and then dispatched in round-robin order. 45c5cc2070STejun Heo * 46c5cc2070STejun Heo * It's also used to track the reference counts on blkg's. A qnode always 47c5cc2070STejun Heo * belongs to a throtl_grp and gets queued on itself or the parent, so 48c5cc2070STejun Heo * incrementing the reference of the associated throtl_grp when a qnode is 49c5cc2070STejun Heo * queued and decrementing when dequeued is enough to keep the whole blkg 50c5cc2070STejun Heo * tree pinned while bios are in flight. 
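 *
 * For illustration (not from the original comment): if a parent's
 * queued[READ] list holds qnode A from a child that queued many bios and
 * qnode B from a child with a single bio, throtl_pop_queued() takes one
 * bio from A, rotates A to the tail of the list and serves B on the next
 * pop, so A cannot monopolize the dispatch window.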
51c5cc2070STejun Heo */ 52c5cc2070STejun Heo struct throtl_qnode { 53c5cc2070STejun Heo struct list_head node; /* service_queue->queued[] */ 54c5cc2070STejun Heo struct bio_list bios; /* queued bios */ 55c5cc2070STejun Heo struct throtl_grp *tg; /* tg this qnode belongs to */ 56c5cc2070STejun Heo }; 57c5cc2070STejun Heo 58c9e0332eSTejun Heo struct throtl_service_queue { 5977216b04STejun Heo struct throtl_service_queue *parent_sq; /* the parent service_queue */ 6077216b04STejun Heo 6173f0d49aSTejun Heo /* 6273f0d49aSTejun Heo * Bios queued directly to this service_queue or dispatched from 6373f0d49aSTejun Heo * children throtl_grp's. 6473f0d49aSTejun Heo */ 65c5cc2070STejun Heo struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ 6673f0d49aSTejun Heo unsigned int nr_queued[2]; /* number of queued bios */ 6773f0d49aSTejun Heo 6873f0d49aSTejun Heo /* 6973f0d49aSTejun Heo * RB tree of active children throtl_grp's, which are sorted by 7073f0d49aSTejun Heo * their ->disptime. 7173f0d49aSTejun Heo */ 72c9e0332eSTejun Heo struct rb_root pending_tree; /* RB tree of active tgs */ 73c9e0332eSTejun Heo struct rb_node *first_pending; /* first node in the tree */ 74c9e0332eSTejun Heo unsigned int nr_pending; /* # queued in the tree */ 75c9e0332eSTejun Heo unsigned long first_pending_disptime; /* disptime of the first tg */ 7669df0ab0STejun Heo struct timer_list pending_timer; /* fires on first_pending_disptime */ 77e43473b7SVivek Goyal }; 78e43473b7SVivek Goyal 795b2c16aaSTejun Heo enum tg_state_flags { 805b2c16aaSTejun Heo THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 810e9f4164STejun Heo THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ 825b2c16aaSTejun Heo }; 835b2c16aaSTejun Heo 84e43473b7SVivek Goyal #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 85e43473b7SVivek Goyal 868a3d2615STejun Heo /* Per-cpu group stats */ 878a3d2615STejun Heo struct tg_stats_cpu { 888a3d2615STejun Heo /* total bytes transferred */ 898a3d2615STejun Heo struct blkg_rwstat service_bytes; 908a3d2615STejun Heo /* total IOs serviced, post merge */ 918a3d2615STejun Heo struct blkg_rwstat serviced; 928a3d2615STejun Heo }; 938a3d2615STejun Heo 94e43473b7SVivek Goyal struct throtl_grp { 95f95a04afSTejun Heo /* must be the first member */ 96f95a04afSTejun Heo struct blkg_policy_data pd; 97f95a04afSTejun Heo 98c9e0332eSTejun Heo /* active throtl group service_queue member */ 99e43473b7SVivek Goyal struct rb_node rb_node; 100e43473b7SVivek Goyal 1010f3457f6STejun Heo /* throtl_data this group belongs to */ 1020f3457f6STejun Heo struct throtl_data *td; 1030f3457f6STejun Heo 10449a2f1e3STejun Heo /* this group's service queue */ 10549a2f1e3STejun Heo struct throtl_service_queue service_queue; 10649a2f1e3STejun Heo 107e43473b7SVivek Goyal /* 108c5cc2070STejun Heo * qnode_on_self is used when bios are directly queued to this 109c5cc2070STejun Heo * throtl_grp so that local bios compete fairly with bios 110c5cc2070STejun Heo * dispatched from children. qnode_on_parent is used when bios are 111c5cc2070STejun Heo * dispatched from this throtl_grp into its parent and will compete 112c5cc2070STejun Heo * with the sibling qnode_on_parents and the parent's 113c5cc2070STejun Heo * qnode_on_self. 114c5cc2070STejun Heo */ 115c5cc2070STejun Heo struct throtl_qnode qnode_on_self[2]; 116c5cc2070STejun Heo struct throtl_qnode qnode_on_parent[2]; 117c5cc2070STejun Heo 118c5cc2070STejun Heo /* 119e43473b7SVivek Goyal * Dispatch time in jiffies. 
This is the estimated time when group 120e43473b7SVivek Goyal * will unthrottle and is ready to dispatch more bio. It is used as 121e43473b7SVivek Goyal * key to sort active groups in service tree. 122e43473b7SVivek Goyal */ 123e43473b7SVivek Goyal unsigned long disptime; 124e43473b7SVivek Goyal 125e43473b7SVivek Goyal unsigned int flags; 126e43473b7SVivek Goyal 127693e751eSTejun Heo /* are there any throtl rules between this group and td? */ 128693e751eSTejun Heo bool has_rules[2]; 129693e751eSTejun Heo 130e43473b7SVivek Goyal /* bytes per second rate limits */ 131e43473b7SVivek Goyal uint64_t bps[2]; 132e43473b7SVivek Goyal 1338e89d13fSVivek Goyal /* IOPS limits */ 1348e89d13fSVivek Goyal unsigned int iops[2]; 1358e89d13fSVivek Goyal 136e43473b7SVivek Goyal /* Number of bytes disptached in current slice */ 137e43473b7SVivek Goyal uint64_t bytes_disp[2]; 1388e89d13fSVivek Goyal /* Number of bio's dispatched in current slice */ 1398e89d13fSVivek Goyal unsigned int io_disp[2]; 140e43473b7SVivek Goyal 141e43473b7SVivek Goyal /* When did we start a new slice */ 142e43473b7SVivek Goyal unsigned long slice_start[2]; 143e43473b7SVivek Goyal unsigned long slice_end[2]; 144fe071437SVivek Goyal 1458a3d2615STejun Heo /* Per cpu stats pointer */ 1468a3d2615STejun Heo struct tg_stats_cpu __percpu *stats_cpu; 1478a3d2615STejun Heo 1488a3d2615STejun Heo /* List of tgs waiting for per cpu stats memory to be allocated */ 1498a3d2615STejun Heo struct list_head stats_alloc_node; 150e43473b7SVivek Goyal }; 151e43473b7SVivek Goyal 152e43473b7SVivek Goyal struct throtl_data 153e43473b7SVivek Goyal { 154e43473b7SVivek Goyal /* service tree for active throtl groups */ 155c9e0332eSTejun Heo struct throtl_service_queue service_queue; 156e43473b7SVivek Goyal 157e43473b7SVivek Goyal struct request_queue *queue; 158e43473b7SVivek Goyal 159e43473b7SVivek Goyal /* Total Number of queued bios on READ and WRITE lists */ 160e43473b7SVivek Goyal unsigned int nr_queued[2]; 161e43473b7SVivek Goyal 162e43473b7SVivek Goyal /* 16302977e4aSVivek Goyal * number of total undestroyed groups 164e43473b7SVivek Goyal */ 165e43473b7SVivek Goyal unsigned int nr_undestroyed_grps; 166e43473b7SVivek Goyal 167e43473b7SVivek Goyal /* Work for dispatching throttled bios */ 16869df0ab0STejun Heo struct work_struct dispatch_work; 169e43473b7SVivek Goyal }; 170e43473b7SVivek Goyal 1718a3d2615STejun Heo /* list and work item to allocate percpu group stats */ 1728a3d2615STejun Heo static DEFINE_SPINLOCK(tg_stats_alloc_lock); 1738a3d2615STejun Heo static LIST_HEAD(tg_stats_alloc_list); 1748a3d2615STejun Heo 1758a3d2615STejun Heo static void tg_stats_alloc_fn(struct work_struct *); 1768a3d2615STejun Heo static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); 1778a3d2615STejun Heo 17869df0ab0STejun Heo static void throtl_pending_timer_fn(unsigned long arg); 17969df0ab0STejun Heo 180f95a04afSTejun Heo static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 181f95a04afSTejun Heo { 182f95a04afSTejun Heo return pd ? 
container_of(pd, struct throtl_grp, pd) : NULL;
183f95a04afSTejun Heo }
184f95a04afSTejun Heo
1853c798398STejun Heo static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
1860381411eSTejun Heo {
187f95a04afSTejun Heo return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
1880381411eSTejun Heo }
1890381411eSTejun Heo
1903c798398STejun Heo static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
1910381411eSTejun Heo {
192f95a04afSTejun Heo return pd_to_blkg(&tg->pd);
1930381411eSTejun Heo }
1940381411eSTejun Heo
19503d8e111STejun Heo static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
19603d8e111STejun Heo {
19703d8e111STejun Heo return blkg_to_tg(td->queue->root_blkg);
19803d8e111STejun Heo }
19903d8e111STejun Heo
200fda6f272STejun Heo /**
201fda6f272STejun Heo * sq_to_tg - return the throtl_grp the specified service queue belongs to
202fda6f272STejun Heo * @sq: the throtl_service_queue of interest
203fda6f272STejun Heo *
204fda6f272STejun Heo * Return the throtl_grp @sq belongs to. If @sq is the top-level one
205fda6f272STejun Heo * embedded in throtl_data, %NULL is returned.
206fda6f272STejun Heo */
207fda6f272STejun Heo static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
208fda6f272STejun Heo {
209fda6f272STejun Heo if (sq && sq->parent_sq)
210fda6f272STejun Heo return container_of(sq, struct throtl_grp, service_queue);
211fda6f272STejun Heo else
212fda6f272STejun Heo return NULL;
213fda6f272STejun Heo }
214fda6f272STejun Heo
215fda6f272STejun Heo /**
216fda6f272STejun Heo * sq_to_td - return throtl_data the specified service queue belongs to
217fda6f272STejun Heo * @sq: the throtl_service_queue of interest
218fda6f272STejun Heo *
219fda6f272STejun Heo * A service_queue can be embedded in either a throtl_grp or throtl_data.
220fda6f272STejun Heo * Determine the associated throtl_data accordingly and return it.
221fda6f272STejun Heo */
222fda6f272STejun Heo static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
223fda6f272STejun Heo {
224fda6f272STejun Heo struct throtl_grp *tg = sq_to_tg(sq);
225fda6f272STejun Heo
226fda6f272STejun Heo if (tg)
227fda6f272STejun Heo return tg->td;
228fda6f272STejun Heo else
229fda6f272STejun Heo return container_of(sq, struct throtl_data, service_queue);
230fda6f272STejun Heo }
231fda6f272STejun Heo
232fda6f272STejun Heo /**
233fda6f272STejun Heo * throtl_log - log debug message via blktrace
234fda6f272STejun Heo * @sq: the service_queue being reported
235fda6f272STejun Heo * @fmt: printf format string
236fda6f272STejun Heo * @args: printf args
237fda6f272STejun Heo *
238fda6f272STejun Heo * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
239fda6f272STejun Heo * throtl_grp; otherwise, just "throtl".
240fda6f272STejun Heo *
241fda6f272STejun Heo * TODO: this should be made a function and name formatting should happen
242fda6f272STejun Heo * after testing whether blktrace is enabled.
243fda6f272STejun Heo */
244fda6f272STejun Heo #define throtl_log(sq, fmt, args...)
do { \ 245fda6f272STejun Heo struct throtl_grp *__tg = sq_to_tg((sq)); \ 246fda6f272STejun Heo struct throtl_data *__td = sq_to_td((sq)); \ 247fda6f272STejun Heo \ 248fda6f272STejun Heo (void)__td; \ 249fda6f272STejun Heo if ((__tg)) { \ 25054e7ed12STejun Heo char __pbuf[128]; \ 25154e7ed12STejun Heo \ 252fda6f272STejun Heo blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ 253fda6f272STejun Heo blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ 254fda6f272STejun Heo } else { \ 255fda6f272STejun Heo blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ 256fda6f272STejun Heo } \ 25754e7ed12STejun Heo } while (0) 258e43473b7SVivek Goyal 25990d3839bSPeter Zijlstra static void tg_stats_init(struct tg_stats_cpu *tg_stats) 26090d3839bSPeter Zijlstra { 26190d3839bSPeter Zijlstra blkg_rwstat_init(&tg_stats->service_bytes); 26290d3839bSPeter Zijlstra blkg_rwstat_init(&tg_stats->serviced); 26390d3839bSPeter Zijlstra } 26490d3839bSPeter Zijlstra 2658a3d2615STejun Heo /* 2668a3d2615STejun Heo * Worker for allocating per cpu stat for tgs. This is scheduled on the 2673b07e9caSTejun Heo * system_wq once there are some groups on the alloc_list waiting for 2688a3d2615STejun Heo * allocation. 2698a3d2615STejun Heo */ 2708a3d2615STejun Heo static void tg_stats_alloc_fn(struct work_struct *work) 2718a3d2615STejun Heo { 2728a3d2615STejun Heo static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ 2738a3d2615STejun Heo struct delayed_work *dwork = to_delayed_work(work); 2748a3d2615STejun Heo bool empty = false; 2758a3d2615STejun Heo 2768a3d2615STejun Heo alloc_stats: 2778a3d2615STejun Heo if (!stats_cpu) { 27890d3839bSPeter Zijlstra int cpu; 27990d3839bSPeter Zijlstra 2808a3d2615STejun Heo stats_cpu = alloc_percpu(struct tg_stats_cpu); 2818a3d2615STejun Heo if (!stats_cpu) { 2828a3d2615STejun Heo /* allocation failed, try again after some time */ 2833b07e9caSTejun Heo schedule_delayed_work(dwork, msecs_to_jiffies(10)); 2848a3d2615STejun Heo return; 2858a3d2615STejun Heo } 28690d3839bSPeter Zijlstra for_each_possible_cpu(cpu) 28790d3839bSPeter Zijlstra tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); 2888a3d2615STejun Heo } 2898a3d2615STejun Heo 2908a3d2615STejun Heo spin_lock_irq(&tg_stats_alloc_lock); 2918a3d2615STejun Heo 2928a3d2615STejun Heo if (!list_empty(&tg_stats_alloc_list)) { 2938a3d2615STejun Heo struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, 2948a3d2615STejun Heo struct throtl_grp, 2958a3d2615STejun Heo stats_alloc_node); 2968a3d2615STejun Heo swap(tg->stats_cpu, stats_cpu); 2978a3d2615STejun Heo list_del_init(&tg->stats_alloc_node); 2988a3d2615STejun Heo } 2998a3d2615STejun Heo 3008a3d2615STejun Heo empty = list_empty(&tg_stats_alloc_list); 3018a3d2615STejun Heo spin_unlock_irq(&tg_stats_alloc_lock); 3028a3d2615STejun Heo if (!empty) 3038a3d2615STejun Heo goto alloc_stats; 3048a3d2615STejun Heo } 3058a3d2615STejun Heo 306c5cc2070STejun Heo static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 307c5cc2070STejun Heo { 308c5cc2070STejun Heo INIT_LIST_HEAD(&qn->node); 309c5cc2070STejun Heo bio_list_init(&qn->bios); 310c5cc2070STejun Heo qn->tg = tg; 311c5cc2070STejun Heo } 312c5cc2070STejun Heo 313c5cc2070STejun Heo /** 314c5cc2070STejun Heo * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it 315c5cc2070STejun Heo * @bio: bio being added 316c5cc2070STejun Heo * @qn: qnode to add bio to 317c5cc2070STejun Heo * @queued: the service_queue->queued[] list @qn belongs to 318c5cc2070STejun Heo * 319c5cc2070STejun Heo * 
Add @bio to @qn and put @qn on @queued if it's not already on. 320c5cc2070STejun Heo * @qn->tg's reference count is bumped when @qn is activated. See the 321c5cc2070STejun Heo * comment on top of throtl_qnode definition for details. 322c5cc2070STejun Heo */ 323c5cc2070STejun Heo static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, 324c5cc2070STejun Heo struct list_head *queued) 325c5cc2070STejun Heo { 326c5cc2070STejun Heo bio_list_add(&qn->bios, bio); 327c5cc2070STejun Heo if (list_empty(&qn->node)) { 328c5cc2070STejun Heo list_add_tail(&qn->node, queued); 329c5cc2070STejun Heo blkg_get(tg_to_blkg(qn->tg)); 330c5cc2070STejun Heo } 331c5cc2070STejun Heo } 332c5cc2070STejun Heo 333c5cc2070STejun Heo /** 334c5cc2070STejun Heo * throtl_peek_queued - peek the first bio on a qnode list 335c5cc2070STejun Heo * @queued: the qnode list to peek 336c5cc2070STejun Heo */ 337c5cc2070STejun Heo static struct bio *throtl_peek_queued(struct list_head *queued) 338c5cc2070STejun Heo { 339c5cc2070STejun Heo struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); 340c5cc2070STejun Heo struct bio *bio; 341c5cc2070STejun Heo 342c5cc2070STejun Heo if (list_empty(queued)) 343c5cc2070STejun Heo return NULL; 344c5cc2070STejun Heo 345c5cc2070STejun Heo bio = bio_list_peek(&qn->bios); 346c5cc2070STejun Heo WARN_ON_ONCE(!bio); 347c5cc2070STejun Heo return bio; 348c5cc2070STejun Heo } 349c5cc2070STejun Heo 350c5cc2070STejun Heo /** 351c5cc2070STejun Heo * throtl_pop_queued - pop the first bio form a qnode list 352c5cc2070STejun Heo * @queued: the qnode list to pop a bio from 353c5cc2070STejun Heo * @tg_to_put: optional out argument for throtl_grp to put 354c5cc2070STejun Heo * 355c5cc2070STejun Heo * Pop the first bio from the qnode list @queued. After popping, the first 356c5cc2070STejun Heo * qnode is removed from @queued if empty or moved to the end of @queued so 357c5cc2070STejun Heo * that the popping order is round-robin. 358c5cc2070STejun Heo * 359c5cc2070STejun Heo * When the first qnode is removed, its associated throtl_grp should be put 360c5cc2070STejun Heo * too. If @tg_to_put is NULL, this function automatically puts it; 361c5cc2070STejun Heo * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is 362c5cc2070STejun Heo * responsible for putting it. 
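 *
 * (tg_dispatch_one_bio() uses @tg_to_put to defer the final blkg_put()
 * until after the bio has been transferred to the parent's queue, so the
 * parent service_queue is not released while the transfer is still in
 * progress.)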
363c5cc2070STejun Heo */ 364c5cc2070STejun Heo static struct bio *throtl_pop_queued(struct list_head *queued, 365c5cc2070STejun Heo struct throtl_grp **tg_to_put) 366c5cc2070STejun Heo { 367c5cc2070STejun Heo struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); 368c5cc2070STejun Heo struct bio *bio; 369c5cc2070STejun Heo 370c5cc2070STejun Heo if (list_empty(queued)) 371c5cc2070STejun Heo return NULL; 372c5cc2070STejun Heo 373c5cc2070STejun Heo bio = bio_list_pop(&qn->bios); 374c5cc2070STejun Heo WARN_ON_ONCE(!bio); 375c5cc2070STejun Heo 376c5cc2070STejun Heo if (bio_list_empty(&qn->bios)) { 377c5cc2070STejun Heo list_del_init(&qn->node); 378c5cc2070STejun Heo if (tg_to_put) 379c5cc2070STejun Heo *tg_to_put = qn->tg; 380c5cc2070STejun Heo else 381c5cc2070STejun Heo blkg_put(tg_to_blkg(qn->tg)); 382c5cc2070STejun Heo } else { 383c5cc2070STejun Heo list_move_tail(&qn->node, queued); 384c5cc2070STejun Heo } 385c5cc2070STejun Heo 386c5cc2070STejun Heo return bio; 387c5cc2070STejun Heo } 388c5cc2070STejun Heo 38949a2f1e3STejun Heo /* init a service_queue, assumes the caller zeroed it */ 39077216b04STejun Heo static void throtl_service_queue_init(struct throtl_service_queue *sq, 39177216b04STejun Heo struct throtl_service_queue *parent_sq) 39249a2f1e3STejun Heo { 393c5cc2070STejun Heo INIT_LIST_HEAD(&sq->queued[0]); 394c5cc2070STejun Heo INIT_LIST_HEAD(&sq->queued[1]); 39549a2f1e3STejun Heo sq->pending_tree = RB_ROOT; 39677216b04STejun Heo sq->parent_sq = parent_sq; 39769df0ab0STejun Heo setup_timer(&sq->pending_timer, throtl_pending_timer_fn, 39869df0ab0STejun Heo (unsigned long)sq); 39969df0ab0STejun Heo } 40069df0ab0STejun Heo 40169df0ab0STejun Heo static void throtl_service_queue_exit(struct throtl_service_queue *sq) 40269df0ab0STejun Heo { 40369df0ab0STejun Heo del_timer_sync(&sq->pending_timer); 40449a2f1e3STejun Heo } 40549a2f1e3STejun Heo 4063c798398STejun Heo static void throtl_pd_init(struct blkcg_gq *blkg) 407a29a171eSVivek Goyal { 4080381411eSTejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 40977216b04STejun Heo struct throtl_data *td = blkg->q->td; 4109138125bSTejun Heo struct throtl_service_queue *parent_sq; 411ff26eaadSTejun Heo unsigned long flags; 412c5cc2070STejun Heo int rw; 413cd1604faSTejun Heo 4149138125bSTejun Heo /* 4159138125bSTejun Heo * If sane_hierarchy is enabled, we switch to properly hierarchical 4169138125bSTejun Heo * behavior where limits on a given throtl_grp are applied to the 4179138125bSTejun Heo * whole subtree rather than just the group itself. e.g. If 16M 4189138125bSTejun Heo * read_bps limit is set on the root group, the whole system can't 4199138125bSTejun Heo * exceed 16M for the device. 4209138125bSTejun Heo * 4219138125bSTejun Heo * If sane_hierarchy is not enabled, the broken flat hierarchy 4229138125bSTejun Heo * behavior is retained where all throtl_grps are treated as if 4239138125bSTejun Heo * they're all separate root groups right below throtl_data. 4249138125bSTejun Heo * Limits of a group don't interact with limits of other groups 4259138125bSTejun Heo * regardless of the position of the group in the hierarchy. 
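 *
 * For example, under the flat behavior a child group with no limits of
 * its own is not constrained by a 16M read_bps limit set on its parent;
 * with sane_hierarchy the child's bios climb through the parent and the
 * parent's limit applies to them as well.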
4269138125bSTejun Heo */ 4279138125bSTejun Heo parent_sq = &td->service_queue; 4289138125bSTejun Heo 4299138125bSTejun Heo if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent) 4309138125bSTejun Heo parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 4319138125bSTejun Heo 4329138125bSTejun Heo throtl_service_queue_init(&tg->service_queue, parent_sq); 4339138125bSTejun Heo 434c5cc2070STejun Heo for (rw = READ; rw <= WRITE; rw++) { 435c5cc2070STejun Heo throtl_qnode_init(&tg->qnode_on_self[rw], tg); 436c5cc2070STejun Heo throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 437c5cc2070STejun Heo } 438c5cc2070STejun Heo 439a29a171eSVivek Goyal RB_CLEAR_NODE(&tg->rb_node); 44077216b04STejun Heo tg->td = td; 441a29a171eSVivek Goyal 442e56da7e2STejun Heo tg->bps[READ] = -1; 443e56da7e2STejun Heo tg->bps[WRITE] = -1; 444e56da7e2STejun Heo tg->iops[READ] = -1; 445e56da7e2STejun Heo tg->iops[WRITE] = -1; 4468a3d2615STejun Heo 4478a3d2615STejun Heo /* 4488a3d2615STejun Heo * Ugh... We need to perform per-cpu allocation for tg->stats_cpu 4498a3d2615STejun Heo * but percpu allocator can't be called from IO path. Queue tg on 4508a3d2615STejun Heo * tg_stats_alloc_list and allocate from work item. 4518a3d2615STejun Heo */ 452ff26eaadSTejun Heo spin_lock_irqsave(&tg_stats_alloc_lock, flags); 4538a3d2615STejun Heo list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); 4543b07e9caSTejun Heo schedule_delayed_work(&tg_stats_alloc_work, 0); 455ff26eaadSTejun Heo spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 4568a3d2615STejun Heo } 4578a3d2615STejun Heo 458693e751eSTejun Heo /* 459693e751eSTejun Heo * Set has_rules[] if @tg or any of its parents have limits configured. 460693e751eSTejun Heo * This doesn't require walking up to the top of the hierarchy as the 461693e751eSTejun Heo * parent's has_rules[] is guaranteed to be correct. 462693e751eSTejun Heo */ 463693e751eSTejun Heo static void tg_update_has_rules(struct throtl_grp *tg) 464693e751eSTejun Heo { 465693e751eSTejun Heo struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 466693e751eSTejun Heo int rw; 467693e751eSTejun Heo 468693e751eSTejun Heo for (rw = READ; rw <= WRITE; rw++) 469693e751eSTejun Heo tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || 470693e751eSTejun Heo (tg->bps[rw] != -1 || tg->iops[rw] != -1); 471693e751eSTejun Heo } 472693e751eSTejun Heo 473693e751eSTejun Heo static void throtl_pd_online(struct blkcg_gq *blkg) 474693e751eSTejun Heo { 475693e751eSTejun Heo /* 476693e751eSTejun Heo * We don't want new groups to escape the limits of its ancestors. 477693e751eSTejun Heo * Update has_rules[] after a new group is brought online. 
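 * (tg_update_has_rules() ORs the parent's has_rules[] into the new
 * group's, so a group whose own limits are unset still records that an
 * ancestor has limits configured.)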
478693e751eSTejun Heo */ 479693e751eSTejun Heo tg_update_has_rules(blkg_to_tg(blkg)); 480693e751eSTejun Heo } 481693e751eSTejun Heo 4823c798398STejun Heo static void throtl_pd_exit(struct blkcg_gq *blkg) 4838a3d2615STejun Heo { 4848a3d2615STejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 485ff26eaadSTejun Heo unsigned long flags; 4868a3d2615STejun Heo 487ff26eaadSTejun Heo spin_lock_irqsave(&tg_stats_alloc_lock, flags); 4888a3d2615STejun Heo list_del_init(&tg->stats_alloc_node); 489ff26eaadSTejun Heo spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); 4908a3d2615STejun Heo 4918a3d2615STejun Heo free_percpu(tg->stats_cpu); 49269df0ab0STejun Heo 49369df0ab0STejun Heo throtl_service_queue_exit(&tg->service_queue); 4948a3d2615STejun Heo } 4958a3d2615STejun Heo 4963c798398STejun Heo static void throtl_pd_reset_stats(struct blkcg_gq *blkg) 4978a3d2615STejun Heo { 4988a3d2615STejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 4998a3d2615STejun Heo int cpu; 5008a3d2615STejun Heo 5018a3d2615STejun Heo if (tg->stats_cpu == NULL) 5028a3d2615STejun Heo return; 5038a3d2615STejun Heo 5048a3d2615STejun Heo for_each_possible_cpu(cpu) { 5058a3d2615STejun Heo struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 5068a3d2615STejun Heo 5078a3d2615STejun Heo blkg_rwstat_reset(&sc->service_bytes); 5088a3d2615STejun Heo blkg_rwstat_reset(&sc->serviced); 5098a3d2615STejun Heo } 510a29a171eSVivek Goyal } 511a29a171eSVivek Goyal 5123c798398STejun Heo static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, 5133c798398STejun Heo struct blkcg *blkcg) 514e43473b7SVivek Goyal { 515e43473b7SVivek Goyal /* 5163c798398STejun Heo * This is the common case when there are no blkcgs. Avoid lookup 5173c798398STejun Heo * in this case 518be2c6b19SVivek Goyal */ 5193c798398STejun Heo if (blkcg == &blkcg_root) 52003d8e111STejun Heo return td_root_tg(td); 521e43473b7SVivek Goyal 522e8989faeSTejun Heo return blkg_to_tg(blkg_lookup(blkcg, td->queue)); 523e43473b7SVivek Goyal } 524e43473b7SVivek Goyal 525cd1604faSTejun Heo static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, 5263c798398STejun Heo struct blkcg *blkcg) 527e43473b7SVivek Goyal { 528f469a7b4SVivek Goyal struct request_queue *q = td->queue; 529cd1604faSTejun Heo struct throtl_grp *tg = NULL; 5300a5a7d0eSTejun Heo 531f469a7b4SVivek Goyal /* 5323c798398STejun Heo * This is the common case when there are no blkcgs. 
Avoid lookup 5333c798398STejun Heo * in this case 534f469a7b4SVivek Goyal */ 5353c798398STejun Heo if (blkcg == &blkcg_root) { 53603d8e111STejun Heo tg = td_root_tg(td); 537cd1604faSTejun Heo } else { 5383c798398STejun Heo struct blkcg_gq *blkg; 539cd1604faSTejun Heo 5403c96cb32STejun Heo blkg = blkg_lookup_create(blkcg, q); 541cd1604faSTejun Heo 542cd1604faSTejun Heo /* if %NULL and @q is alive, fall back to root_tg */ 543cd1604faSTejun Heo if (!IS_ERR(blkg)) 5440381411eSTejun Heo tg = blkg_to_tg(blkg); 5453f3299d5SBart Van Assche else if (!blk_queue_dying(q)) 54603d8e111STejun Heo tg = td_root_tg(td); 547f469a7b4SVivek Goyal } 548f469a7b4SVivek Goyal 549e43473b7SVivek Goyal return tg; 550e43473b7SVivek Goyal } 551e43473b7SVivek Goyal 5520049af73STejun Heo static struct throtl_grp * 5530049af73STejun Heo throtl_rb_first(struct throtl_service_queue *parent_sq) 554e43473b7SVivek Goyal { 555e43473b7SVivek Goyal /* Service tree is empty */ 5560049af73STejun Heo if (!parent_sq->nr_pending) 557e43473b7SVivek Goyal return NULL; 558e43473b7SVivek Goyal 5590049af73STejun Heo if (!parent_sq->first_pending) 5600049af73STejun Heo parent_sq->first_pending = rb_first(&parent_sq->pending_tree); 561e43473b7SVivek Goyal 5620049af73STejun Heo if (parent_sq->first_pending) 5630049af73STejun Heo return rb_entry_tg(parent_sq->first_pending); 564e43473b7SVivek Goyal 565e43473b7SVivek Goyal return NULL; 566e43473b7SVivek Goyal } 567e43473b7SVivek Goyal 568e43473b7SVivek Goyal static void rb_erase_init(struct rb_node *n, struct rb_root *root) 569e43473b7SVivek Goyal { 570e43473b7SVivek Goyal rb_erase(n, root); 571e43473b7SVivek Goyal RB_CLEAR_NODE(n); 572e43473b7SVivek Goyal } 573e43473b7SVivek Goyal 5740049af73STejun Heo static void throtl_rb_erase(struct rb_node *n, 5750049af73STejun Heo struct throtl_service_queue *parent_sq) 576e43473b7SVivek Goyal { 5770049af73STejun Heo if (parent_sq->first_pending == n) 5780049af73STejun Heo parent_sq->first_pending = NULL; 5790049af73STejun Heo rb_erase_init(n, &parent_sq->pending_tree); 5800049af73STejun Heo --parent_sq->nr_pending; 581e43473b7SVivek Goyal } 582e43473b7SVivek Goyal 5830049af73STejun Heo static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 584e43473b7SVivek Goyal { 585e43473b7SVivek Goyal struct throtl_grp *tg; 586e43473b7SVivek Goyal 5870049af73STejun Heo tg = throtl_rb_first(parent_sq); 588e43473b7SVivek Goyal if (!tg) 589e43473b7SVivek Goyal return; 590e43473b7SVivek Goyal 5910049af73STejun Heo parent_sq->first_pending_disptime = tg->disptime; 592e43473b7SVivek Goyal } 593e43473b7SVivek Goyal 59477216b04STejun Heo static void tg_service_queue_add(struct throtl_grp *tg) 595e43473b7SVivek Goyal { 59677216b04STejun Heo struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 5970049af73STejun Heo struct rb_node **node = &parent_sq->pending_tree.rb_node; 598e43473b7SVivek Goyal struct rb_node *parent = NULL; 599e43473b7SVivek Goyal struct throtl_grp *__tg; 600e43473b7SVivek Goyal unsigned long key = tg->disptime; 601e43473b7SVivek Goyal int left = 1; 602e43473b7SVivek Goyal 603e43473b7SVivek Goyal while (*node != NULL) { 604e43473b7SVivek Goyal parent = *node; 605e43473b7SVivek Goyal __tg = rb_entry_tg(parent); 606e43473b7SVivek Goyal 607e43473b7SVivek Goyal if (time_before(key, __tg->disptime)) 608e43473b7SVivek Goyal node = &parent->rb_left; 609e43473b7SVivek Goyal else { 610e43473b7SVivek Goyal node = &parent->rb_right; 611e43473b7SVivek Goyal left = 0; 612e43473b7SVivek Goyal } 613e43473b7SVivek Goyal } 
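	/*
	 * If we walked left at every step, the new node is the leftmost
	 * entry; cache it in ->first_pending so throtl_rb_first() can
	 * avoid the rb_first() walk.
	 */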
614e43473b7SVivek Goyal 615e43473b7SVivek Goyal if (left) 6160049af73STejun Heo parent_sq->first_pending = &tg->rb_node; 617e43473b7SVivek Goyal 618e43473b7SVivek Goyal rb_link_node(&tg->rb_node, parent, node); 6190049af73STejun Heo rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); 620e43473b7SVivek Goyal } 621e43473b7SVivek Goyal 62277216b04STejun Heo static void __throtl_enqueue_tg(struct throtl_grp *tg) 623e43473b7SVivek Goyal { 62477216b04STejun Heo tg_service_queue_add(tg); 6255b2c16aaSTejun Heo tg->flags |= THROTL_TG_PENDING; 62677216b04STejun Heo tg->service_queue.parent_sq->nr_pending++; 627e43473b7SVivek Goyal } 628e43473b7SVivek Goyal 62977216b04STejun Heo static void throtl_enqueue_tg(struct throtl_grp *tg) 630e43473b7SVivek Goyal { 6315b2c16aaSTejun Heo if (!(tg->flags & THROTL_TG_PENDING)) 63277216b04STejun Heo __throtl_enqueue_tg(tg); 633e43473b7SVivek Goyal } 634e43473b7SVivek Goyal 63577216b04STejun Heo static void __throtl_dequeue_tg(struct throtl_grp *tg) 636e43473b7SVivek Goyal { 63777216b04STejun Heo throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 6385b2c16aaSTejun Heo tg->flags &= ~THROTL_TG_PENDING; 639e43473b7SVivek Goyal } 640e43473b7SVivek Goyal 64177216b04STejun Heo static void throtl_dequeue_tg(struct throtl_grp *tg) 642e43473b7SVivek Goyal { 6435b2c16aaSTejun Heo if (tg->flags & THROTL_TG_PENDING) 64477216b04STejun Heo __throtl_dequeue_tg(tg); 645e43473b7SVivek Goyal } 646e43473b7SVivek Goyal 647a9131a27STejun Heo /* Call with queue lock held */ 64869df0ab0STejun Heo static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 64969df0ab0STejun Heo unsigned long expires) 650a9131a27STejun Heo { 65169df0ab0STejun Heo mod_timer(&sq->pending_timer, expires); 65269df0ab0STejun Heo throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 65369df0ab0STejun Heo expires - jiffies, jiffies); 654a9131a27STejun Heo } 655a9131a27STejun Heo 6567f52f98cSTejun Heo /** 6577f52f98cSTejun Heo * throtl_schedule_next_dispatch - schedule the next dispatch cycle 6587f52f98cSTejun Heo * @sq: the service_queue to schedule dispatch for 6597f52f98cSTejun Heo * @force: force scheduling 6607f52f98cSTejun Heo * 6617f52f98cSTejun Heo * Arm @sq->pending_timer so that the next dispatch cycle starts on the 6627f52f98cSTejun Heo * dispatch time of the first pending child. Returns %true if either timer 6637f52f98cSTejun Heo * is armed or there's no pending child left. %false if the current 6647f52f98cSTejun Heo * dispatch window is still open and the caller should continue 6657f52f98cSTejun Heo * dispatching. 6667f52f98cSTejun Heo * 6677f52f98cSTejun Heo * If @force is %true, the dispatch timer is always scheduled and this 6687f52f98cSTejun Heo * function is guaranteed to return %true. This is to be used when the 6697f52f98cSTejun Heo * caller can't dispatch itself and needs to invoke pending_timer 6707f52f98cSTejun Heo * unconditionally. Note that forced scheduling is likely to induce short 6717f52f98cSTejun Heo * delay before dispatch starts even if @sq->first_pending_disptime is not 6727f52f98cSTejun Heo * in the future and thus shouldn't be used in hot paths. 6737f52f98cSTejun Heo */ 6747f52f98cSTejun Heo static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 6757f52f98cSTejun Heo bool force) 676e43473b7SVivek Goyal { 6776a525600STejun Heo /* any pending children left? 
*/ 678c9e0332eSTejun Heo if (!sq->nr_pending) 6797f52f98cSTejun Heo return true; 680e43473b7SVivek Goyal 681c9e0332eSTejun Heo update_min_dispatch_time(sq); 682e43473b7SVivek Goyal 68369df0ab0STejun Heo /* is the next dispatch time in the future? */ 6847f52f98cSTejun Heo if (force || time_after(sq->first_pending_disptime, jiffies)) { 68569df0ab0STejun Heo throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 6867f52f98cSTejun Heo return true; 68769df0ab0STejun Heo } 68869df0ab0STejun Heo 6897f52f98cSTejun Heo /* tell the caller to continue dispatching */ 6907f52f98cSTejun Heo return false; 691e43473b7SVivek Goyal } 692e43473b7SVivek Goyal 69332ee5bc4SVivek Goyal static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, 69432ee5bc4SVivek Goyal bool rw, unsigned long start) 69532ee5bc4SVivek Goyal { 69632ee5bc4SVivek Goyal tg->bytes_disp[rw] = 0; 69732ee5bc4SVivek Goyal tg->io_disp[rw] = 0; 69832ee5bc4SVivek Goyal 69932ee5bc4SVivek Goyal /* 70032ee5bc4SVivek Goyal * Previous slice has expired. We must have trimmed it after last 70132ee5bc4SVivek Goyal * bio dispatch. That means since start of last slice, we never used 70232ee5bc4SVivek Goyal * that bandwidth. Do try to make use of that bandwidth while giving 70332ee5bc4SVivek Goyal * credit. 70432ee5bc4SVivek Goyal */ 70532ee5bc4SVivek Goyal if (time_after_eq(start, tg->slice_start[rw])) 70632ee5bc4SVivek Goyal tg->slice_start[rw] = start; 70732ee5bc4SVivek Goyal 70832ee5bc4SVivek Goyal tg->slice_end[rw] = jiffies + throtl_slice; 70932ee5bc4SVivek Goyal throtl_log(&tg->service_queue, 71032ee5bc4SVivek Goyal "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 71132ee5bc4SVivek Goyal rw == READ ? 'R' : 'W', tg->slice_start[rw], 71232ee5bc4SVivek Goyal tg->slice_end[rw], jiffies); 71332ee5bc4SVivek Goyal } 71432ee5bc4SVivek Goyal 7150f3457f6STejun Heo static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) 716e43473b7SVivek Goyal { 717e43473b7SVivek Goyal tg->bytes_disp[rw] = 0; 7188e89d13fSVivek Goyal tg->io_disp[rw] = 0; 719e43473b7SVivek Goyal tg->slice_start[rw] = jiffies; 720e43473b7SVivek Goyal tg->slice_end[rw] = jiffies + throtl_slice; 721fda6f272STejun Heo throtl_log(&tg->service_queue, 722fda6f272STejun Heo "[%c] new slice start=%lu end=%lu jiffies=%lu", 723e43473b7SVivek Goyal rw == READ ? 'R' : 'W', tg->slice_start[rw], 724e43473b7SVivek Goyal tg->slice_end[rw], jiffies); 725e43473b7SVivek Goyal } 726e43473b7SVivek Goyal 7270f3457f6STejun Heo static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 7280f3457f6STejun Heo unsigned long jiffy_end) 729d1ae8ffdSVivek Goyal { 730d1ae8ffdSVivek Goyal tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 731d1ae8ffdSVivek Goyal } 732d1ae8ffdSVivek Goyal 7330f3457f6STejun Heo static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 7340f3457f6STejun Heo unsigned long jiffy_end) 735e43473b7SVivek Goyal { 736e43473b7SVivek Goyal tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 737fda6f272STejun Heo throtl_log(&tg->service_queue, 738fda6f272STejun Heo "[%c] extend slice start=%lu end=%lu jiffies=%lu", 739e43473b7SVivek Goyal rw == READ ? 
'R' : 'W', tg->slice_start[rw],
740e43473b7SVivek Goyal tg->slice_end[rw], jiffies);
741e43473b7SVivek Goyal }
742e43473b7SVivek Goyal
743e43473b7SVivek Goyal /* Determine if previously allocated or extended slice is complete or not */
7440f3457f6STejun Heo static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
745e43473b7SVivek Goyal {
746e43473b7SVivek Goyal if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
747e43473b7SVivek Goyal return 0;
748e43473b7SVivek Goyal
749e43473b7SVivek Goyal return 1;
750e43473b7SVivek Goyal }
751e43473b7SVivek Goyal
752e43473b7SVivek Goyal /* Trim the used slices and adjust slice start accordingly */
7530f3457f6STejun Heo static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
754e43473b7SVivek Goyal {
7553aad5d3eSVivek Goyal unsigned long nr_slices, time_elapsed, io_trim;
7563aad5d3eSVivek Goyal u64 bytes_trim, tmp;
757e43473b7SVivek Goyal
758e43473b7SVivek Goyal BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
759e43473b7SVivek Goyal
760e43473b7SVivek Goyal /*
761e43473b7SVivek Goyal * If bps are unlimited (-1), then time slices don't get
762e43473b7SVivek Goyal * renewed. Don't try to trim the slice if it has already expired. A new
763e43473b7SVivek Goyal * slice will start when appropriate.
764e43473b7SVivek Goyal */
7650f3457f6STejun Heo if (throtl_slice_used(tg, rw))
766e43473b7SVivek Goyal return;
767e43473b7SVivek Goyal
768d1ae8ffdSVivek Goyal /*
769d1ae8ffdSVivek Goyal * A bio has been dispatched. Also adjust slice_end. It might happen
770d1ae8ffdSVivek Goyal * that initially cgroup limit was very low resulting in high
771d1ae8ffdSVivek Goyal * slice_end, but later limit was bumped up and bio was dispatched
772d1ae8ffdSVivek Goyal * sooner, then we need to reduce slice_end. A high bogus slice_end
773d1ae8ffdSVivek Goyal * is bad because it does not allow new slice to start.
774d1ae8ffdSVivek Goyal */
775d1ae8ffdSVivek Goyal
7760f3457f6STejun Heo throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
777d1ae8ffdSVivek Goyal
778e43473b7SVivek Goyal time_elapsed = jiffies - tg->slice_start[rw];
779e43473b7SVivek Goyal
780e43473b7SVivek Goyal nr_slices = time_elapsed / throtl_slice;
781e43473b7SVivek Goyal
782e43473b7SVivek Goyal if (!nr_slices)
783e43473b7SVivek Goyal return;
7843aad5d3eSVivek Goyal tmp = tg->bps[rw] * throtl_slice * nr_slices;
7853aad5d3eSVivek Goyal do_div(tmp, HZ);
7863aad5d3eSVivek Goyal bytes_trim = tmp;
787e43473b7SVivek Goyal
7888e89d13fSVivek Goyal io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
789e43473b7SVivek Goyal
7908e89d13fSVivek Goyal if (!bytes_trim && !io_trim)
791e43473b7SVivek Goyal return;
792e43473b7SVivek Goyal
793e43473b7SVivek Goyal if (tg->bytes_disp[rw] >= bytes_trim)
794e43473b7SVivek Goyal tg->bytes_disp[rw] -= bytes_trim;
795e43473b7SVivek Goyal else
796e43473b7SVivek Goyal tg->bytes_disp[rw] = 0;
797e43473b7SVivek Goyal
7988e89d13fSVivek Goyal if (tg->io_disp[rw] >= io_trim)
7998e89d13fSVivek Goyal tg->io_disp[rw] -= io_trim;
8008e89d13fSVivek Goyal else
8018e89d13fSVivek Goyal tg->io_disp[rw] = 0;
8028e89d13fSVivek Goyal
803e43473b7SVivek Goyal tg->slice_start[rw] += nr_slices * throtl_slice;
804e43473b7SVivek Goyal
805fda6f272STejun Heo throtl_log(&tg->service_queue,
806fda6f272STejun Heo "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
8078e89d13fSVivek Goyal rw == READ ?
'R' : 'W', nr_slices, bytes_trim, io_trim, 808e43473b7SVivek Goyal tg->slice_start[rw], tg->slice_end[rw], jiffies); 809e43473b7SVivek Goyal } 810e43473b7SVivek Goyal 8110f3457f6STejun Heo static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, 8120f3457f6STejun Heo unsigned long *wait) 813e43473b7SVivek Goyal { 814e43473b7SVivek Goyal bool rw = bio_data_dir(bio); 8158e89d13fSVivek Goyal unsigned int io_allowed; 816e43473b7SVivek Goyal unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 817c49c06e4SVivek Goyal u64 tmp; 818e43473b7SVivek Goyal 8198e89d13fSVivek Goyal jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 820e43473b7SVivek Goyal 8218e89d13fSVivek Goyal /* Slice has just started. Consider one slice interval */ 8228e89d13fSVivek Goyal if (!jiffy_elapsed) 8238e89d13fSVivek Goyal jiffy_elapsed_rnd = throtl_slice; 8248e89d13fSVivek Goyal 8258e89d13fSVivek Goyal jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 8268e89d13fSVivek Goyal 827c49c06e4SVivek Goyal /* 828c49c06e4SVivek Goyal * jiffy_elapsed_rnd should not be a big value as minimum iops can be 829c49c06e4SVivek Goyal * 1 then at max jiffy elapsed should be equivalent of 1 second as we 830c49c06e4SVivek Goyal * will allow dispatch after 1 second and after that slice should 831c49c06e4SVivek Goyal * have been trimmed. 832c49c06e4SVivek Goyal */ 833c49c06e4SVivek Goyal 834c49c06e4SVivek Goyal tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; 835c49c06e4SVivek Goyal do_div(tmp, HZ); 836c49c06e4SVivek Goyal 837c49c06e4SVivek Goyal if (tmp > UINT_MAX) 838c49c06e4SVivek Goyal io_allowed = UINT_MAX; 839c49c06e4SVivek Goyal else 840c49c06e4SVivek Goyal io_allowed = tmp; 8418e89d13fSVivek Goyal 8428e89d13fSVivek Goyal if (tg->io_disp[rw] + 1 <= io_allowed) { 843e43473b7SVivek Goyal if (wait) 844e43473b7SVivek Goyal *wait = 0; 845e43473b7SVivek Goyal return 1; 846e43473b7SVivek Goyal } 847e43473b7SVivek Goyal 8488e89d13fSVivek Goyal /* Calc approx time to dispatch */ 8498e89d13fSVivek Goyal jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; 8508e89d13fSVivek Goyal 8518e89d13fSVivek Goyal if (jiffy_wait > jiffy_elapsed) 8528e89d13fSVivek Goyal jiffy_wait = jiffy_wait - jiffy_elapsed; 8538e89d13fSVivek Goyal else 8548e89d13fSVivek Goyal jiffy_wait = 1; 8558e89d13fSVivek Goyal 8568e89d13fSVivek Goyal if (wait) 8578e89d13fSVivek Goyal *wait = jiffy_wait; 8588e89d13fSVivek Goyal return 0; 859e43473b7SVivek Goyal } 860e43473b7SVivek Goyal 8610f3457f6STejun Heo static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 8620f3457f6STejun Heo unsigned long *wait) 8638e89d13fSVivek Goyal { 8648e89d13fSVivek Goyal bool rw = bio_data_dir(bio); 8653aad5d3eSVivek Goyal u64 bytes_allowed, extra_bytes, tmp; 8668e89d13fSVivek Goyal unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 8678e89d13fSVivek Goyal 868e43473b7SVivek Goyal jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 869e43473b7SVivek Goyal 870e43473b7SVivek Goyal /* Slice has just started. 
Consider one slice interval */ 871e43473b7SVivek Goyal if (!jiffy_elapsed) 872e43473b7SVivek Goyal jiffy_elapsed_rnd = throtl_slice; 873e43473b7SVivek Goyal 874e43473b7SVivek Goyal jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 875e43473b7SVivek Goyal 8765e901a2bSVivek Goyal tmp = tg->bps[rw] * jiffy_elapsed_rnd; 8775e901a2bSVivek Goyal do_div(tmp, HZ); 8783aad5d3eSVivek Goyal bytes_allowed = tmp; 879e43473b7SVivek Goyal 8804f024f37SKent Overstreet if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { 881e43473b7SVivek Goyal if (wait) 882e43473b7SVivek Goyal *wait = 0; 883e43473b7SVivek Goyal return 1; 884e43473b7SVivek Goyal } 885e43473b7SVivek Goyal 886e43473b7SVivek Goyal /* Calc approx time to dispatch */ 8874f024f37SKent Overstreet extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; 888e43473b7SVivek Goyal jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); 889e43473b7SVivek Goyal 890e43473b7SVivek Goyal if (!jiffy_wait) 891e43473b7SVivek Goyal jiffy_wait = 1; 892e43473b7SVivek Goyal 893e43473b7SVivek Goyal /* 894e43473b7SVivek Goyal * This wait time is without taking into consideration the rounding 895e43473b7SVivek Goyal * up we did. Add that time also. 896e43473b7SVivek Goyal */ 897e43473b7SVivek Goyal jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 898e43473b7SVivek Goyal if (wait) 899e43473b7SVivek Goyal *wait = jiffy_wait; 9008e89d13fSVivek Goyal return 0; 9018e89d13fSVivek Goyal } 902e43473b7SVivek Goyal 9038e89d13fSVivek Goyal /* 9048e89d13fSVivek Goyal * Returns whether one can dispatch a bio or not. Also returns approx number 9058e89d13fSVivek Goyal * of jiffies to wait before this bio is with-in IO rate and can be dispatched 9068e89d13fSVivek Goyal */ 9070f3457f6STejun Heo static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 9080f3457f6STejun Heo unsigned long *wait) 9098e89d13fSVivek Goyal { 9108e89d13fSVivek Goyal bool rw = bio_data_dir(bio); 9118e89d13fSVivek Goyal unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 9128e89d13fSVivek Goyal 9138e89d13fSVivek Goyal /* 9148e89d13fSVivek Goyal * Currently whole state machine of group depends on first bio 9158e89d13fSVivek Goyal * queued in the group bio list. So one should not be calling 9168e89d13fSVivek Goyal * this function with a different bio if there are other bios 9178e89d13fSVivek Goyal * queued. 9188e89d13fSVivek Goyal */ 91973f0d49aSTejun Heo BUG_ON(tg->service_queue.nr_queued[rw] && 920c5cc2070STejun Heo bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 9218e89d13fSVivek Goyal 9228e89d13fSVivek Goyal /* If tg->bps = -1, then BW is unlimited */ 9238e89d13fSVivek Goyal if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 9248e89d13fSVivek Goyal if (wait) 9258e89d13fSVivek Goyal *wait = 0; 9268e89d13fSVivek Goyal return 1; 9278e89d13fSVivek Goyal } 9288e89d13fSVivek Goyal 9298e89d13fSVivek Goyal /* 9308e89d13fSVivek Goyal * If previous slice expired, start a new one otherwise renew/extend 9318e89d13fSVivek Goyal * existing slice to make sure it is at least throtl_slice interval 9328e89d13fSVivek Goyal * long since now. 
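 *
 * For illustration: with bps = 1MB/s and the 100ms throtl_slice, a fresh
 * slice allows roughly 100KB; a 1MB bio therefore gets a wait of roughly
 * a second from tg_with_in_bps_limit() and the slice is extended below to
 * cover that wait.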
9338e89d13fSVivek Goyal */ 9340f3457f6STejun Heo if (throtl_slice_used(tg, rw)) 9350f3457f6STejun Heo throtl_start_new_slice(tg, rw); 9368e89d13fSVivek Goyal else { 9378e89d13fSVivek Goyal if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 9380f3457f6STejun Heo throtl_extend_slice(tg, rw, jiffies + throtl_slice); 9398e89d13fSVivek Goyal } 9408e89d13fSVivek Goyal 9410f3457f6STejun Heo if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 9420f3457f6STejun Heo tg_with_in_iops_limit(tg, bio, &iops_wait)) { 9438e89d13fSVivek Goyal if (wait) 9448e89d13fSVivek Goyal *wait = 0; 9458e89d13fSVivek Goyal return 1; 9468e89d13fSVivek Goyal } 9478e89d13fSVivek Goyal 9488e89d13fSVivek Goyal max_wait = max(bps_wait, iops_wait); 9498e89d13fSVivek Goyal 9508e89d13fSVivek Goyal if (wait) 9518e89d13fSVivek Goyal *wait = max_wait; 9528e89d13fSVivek Goyal 9538e89d13fSVivek Goyal if (time_before(tg->slice_end[rw], jiffies + max_wait)) 9540f3457f6STejun Heo throtl_extend_slice(tg, rw, jiffies + max_wait); 955e43473b7SVivek Goyal 956e43473b7SVivek Goyal return 0; 957e43473b7SVivek Goyal } 958e43473b7SVivek Goyal 9593c798398STejun Heo static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, 960629ed0b1STejun Heo int rw) 961629ed0b1STejun Heo { 9628a3d2615STejun Heo struct throtl_grp *tg = blkg_to_tg(blkg); 9638a3d2615STejun Heo struct tg_stats_cpu *stats_cpu; 964629ed0b1STejun Heo unsigned long flags; 965629ed0b1STejun Heo 966629ed0b1STejun Heo /* If per cpu stats are not allocated yet, don't do any accounting. */ 9678a3d2615STejun Heo if (tg->stats_cpu == NULL) 968629ed0b1STejun Heo return; 969629ed0b1STejun Heo 970629ed0b1STejun Heo /* 971629ed0b1STejun Heo * Disabling interrupts to provide mutual exclusion between two 972629ed0b1STejun Heo * writes on same cpu. It probably is not needed for 64bit. Not 973629ed0b1STejun Heo * optimizing that case yet. 974629ed0b1STejun Heo */ 975629ed0b1STejun Heo local_irq_save(flags); 976629ed0b1STejun Heo 9778a3d2615STejun Heo stats_cpu = this_cpu_ptr(tg->stats_cpu); 978629ed0b1STejun Heo 979629ed0b1STejun Heo blkg_rwstat_add(&stats_cpu->serviced, rw, 1); 980629ed0b1STejun Heo blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); 981629ed0b1STejun Heo 982629ed0b1STejun Heo local_irq_restore(flags); 983629ed0b1STejun Heo } 984629ed0b1STejun Heo 985e43473b7SVivek Goyal static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 986e43473b7SVivek Goyal { 987e43473b7SVivek Goyal bool rw = bio_data_dir(bio); 988e43473b7SVivek Goyal 989e43473b7SVivek Goyal /* Charge the bio to the group */ 9904f024f37SKent Overstreet tg->bytes_disp[rw] += bio->bi_iter.bi_size; 9918e89d13fSVivek Goyal tg->io_disp[rw]++; 992e43473b7SVivek Goyal 9932a0f61e6STejun Heo /* 9942a0f61e6STejun Heo * REQ_THROTTLED is used to prevent the same bio to be throttled 9952a0f61e6STejun Heo * more than once as a throttled bio will go through blk-throtl the 9962a0f61e6STejun Heo * second time when it eventually gets issued. Set it when a bio 9972a0f61e6STejun Heo * is being charged to a tg. 9982a0f61e6STejun Heo * 9992a0f61e6STejun Heo * Dispatch stats aren't recursive and each @bio should only be 10002a0f61e6STejun Heo * accounted by the @tg it was originally associated with. Let's 10012a0f61e6STejun Heo * update the stats when setting REQ_THROTTLED for the first time 10022a0f61e6STejun Heo * which is guaranteed to be for the @bio's original tg. 
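 *
 * (The charge itself, bytes_disp/io_disp, is repeated at each level as
 * the bio climbs the hierarchy; only this stats update is limited to the
 * original tg.)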
10032a0f61e6STejun Heo */ 10042a0f61e6STejun Heo if (!(bio->bi_rw & REQ_THROTTLED)) { 10052a0f61e6STejun Heo bio->bi_rw |= REQ_THROTTLED; 10064f024f37SKent Overstreet throtl_update_dispatch_stats(tg_to_blkg(tg), 10074f024f37SKent Overstreet bio->bi_iter.bi_size, bio->bi_rw); 10082a0f61e6STejun Heo } 1009e43473b7SVivek Goyal } 1010e43473b7SVivek Goyal 1011c5cc2070STejun Heo /** 1012c5cc2070STejun Heo * throtl_add_bio_tg - add a bio to the specified throtl_grp 1013c5cc2070STejun Heo * @bio: bio to add 1014c5cc2070STejun Heo * @qn: qnode to use 1015c5cc2070STejun Heo * @tg: the target throtl_grp 1016c5cc2070STejun Heo * 1017c5cc2070STejun Heo * Add @bio to @tg's service_queue using @qn. If @qn is not specified, 1018c5cc2070STejun Heo * tg->qnode_on_self[] is used. 1019c5cc2070STejun Heo */ 1020c5cc2070STejun Heo static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, 1021c5cc2070STejun Heo struct throtl_grp *tg) 1022e43473b7SVivek Goyal { 102373f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 1024e43473b7SVivek Goyal bool rw = bio_data_dir(bio); 1025e43473b7SVivek Goyal 1026c5cc2070STejun Heo if (!qn) 1027c5cc2070STejun Heo qn = &tg->qnode_on_self[rw]; 1028c5cc2070STejun Heo 10290e9f4164STejun Heo /* 10300e9f4164STejun Heo * If @tg doesn't currently have any bios queued in the same 10310e9f4164STejun Heo * direction, queueing @bio can change when @tg should be 10320e9f4164STejun Heo * dispatched. Mark that @tg was empty. This is automatically 10330e9f4164STejun Heo * cleaered on the next tg_update_disptime(). 10340e9f4164STejun Heo */ 10350e9f4164STejun Heo if (!sq->nr_queued[rw]) 10360e9f4164STejun Heo tg->flags |= THROTL_TG_WAS_EMPTY; 10370e9f4164STejun Heo 1038c5cc2070STejun Heo throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); 1039c5cc2070STejun Heo 104073f0d49aSTejun Heo sq->nr_queued[rw]++; 104177216b04STejun Heo throtl_enqueue_tg(tg); 1042e43473b7SVivek Goyal } 1043e43473b7SVivek Goyal 104477216b04STejun Heo static void tg_update_disptime(struct throtl_grp *tg) 1045e43473b7SVivek Goyal { 104673f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 1047e43473b7SVivek Goyal unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 1048e43473b7SVivek Goyal struct bio *bio; 1049e43473b7SVivek Goyal 1050c5cc2070STejun Heo if ((bio = throtl_peek_queued(&sq->queued[READ]))) 10510f3457f6STejun Heo tg_may_dispatch(tg, bio, &read_wait); 1052e43473b7SVivek Goyal 1053c5cc2070STejun Heo if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) 10540f3457f6STejun Heo tg_may_dispatch(tg, bio, &write_wait); 1055e43473b7SVivek Goyal 1056e43473b7SVivek Goyal min_wait = min(read_wait, write_wait); 1057e43473b7SVivek Goyal disptime = jiffies + min_wait; 1058e43473b7SVivek Goyal 1059e43473b7SVivek Goyal /* Update dispatch time */ 106077216b04STejun Heo throtl_dequeue_tg(tg); 1061e43473b7SVivek Goyal tg->disptime = disptime; 106277216b04STejun Heo throtl_enqueue_tg(tg); 10630e9f4164STejun Heo 10640e9f4164STejun Heo /* see throtl_add_bio_tg() */ 10650e9f4164STejun Heo tg->flags &= ~THROTL_TG_WAS_EMPTY; 1066e43473b7SVivek Goyal } 1067e43473b7SVivek Goyal 106832ee5bc4SVivek Goyal static void start_parent_slice_with_credit(struct throtl_grp *child_tg, 106932ee5bc4SVivek Goyal struct throtl_grp *parent_tg, bool rw) 107032ee5bc4SVivek Goyal { 107132ee5bc4SVivek Goyal if (throtl_slice_used(parent_tg, rw)) { 107232ee5bc4SVivek Goyal throtl_start_new_slice_with_credit(parent_tg, rw, 107332ee5bc4SVivek Goyal child_tg->slice_start[rw]); 107432ee5bc4SVivek 
Goyal } 107532ee5bc4SVivek Goyal 107632ee5bc4SVivek Goyal } 107732ee5bc4SVivek Goyal 107877216b04STejun Heo static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 1079e43473b7SVivek Goyal { 108073f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 10816bc9c2b4STejun Heo struct throtl_service_queue *parent_sq = sq->parent_sq; 10826bc9c2b4STejun Heo struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 1083c5cc2070STejun Heo struct throtl_grp *tg_to_put = NULL; 1084e43473b7SVivek Goyal struct bio *bio; 1085e43473b7SVivek Goyal 1086c5cc2070STejun Heo /* 1087c5cc2070STejun Heo * @bio is being transferred from @tg to @parent_sq. Popping a bio 1088c5cc2070STejun Heo * from @tg may put its reference and @parent_sq might end up 1089c5cc2070STejun Heo * getting released prematurely. Remember the tg to put and put it 1090c5cc2070STejun Heo * after @bio is transferred to @parent_sq. 1091c5cc2070STejun Heo */ 1092c5cc2070STejun Heo bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); 109373f0d49aSTejun Heo sq->nr_queued[rw]--; 1094e43473b7SVivek Goyal 1095e43473b7SVivek Goyal throtl_charge_bio(tg, bio); 10966bc9c2b4STejun Heo 10976bc9c2b4STejun Heo /* 10986bc9c2b4STejun Heo * If our parent is another tg, we just need to transfer @bio to 10996bc9c2b4STejun Heo * the parent using throtl_add_bio_tg(). If our parent is 11006bc9c2b4STejun Heo * @td->service_queue, @bio is ready to be issued. Put it on its 11016bc9c2b4STejun Heo * bio_lists[] and decrease total number queued. The caller is 11026bc9c2b4STejun Heo * responsible for issuing these bios. 11036bc9c2b4STejun Heo */ 11046bc9c2b4STejun Heo if (parent_tg) { 1105c5cc2070STejun Heo throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 110632ee5bc4SVivek Goyal start_parent_slice_with_credit(tg, parent_tg, rw); 11076bc9c2b4STejun Heo } else { 1108c5cc2070STejun Heo throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 1109c5cc2070STejun Heo &parent_sq->queued[rw]); 11106bc9c2b4STejun Heo BUG_ON(tg->td->nr_queued[rw] <= 0); 11116bc9c2b4STejun Heo tg->td->nr_queued[rw]--; 11126bc9c2b4STejun Heo } 1113e43473b7SVivek Goyal 11140f3457f6STejun Heo throtl_trim_slice(tg, rw); 11156bc9c2b4STejun Heo 1116c5cc2070STejun Heo if (tg_to_put) 1117c5cc2070STejun Heo blkg_put(tg_to_blkg(tg_to_put)); 1118e43473b7SVivek Goyal } 1119e43473b7SVivek Goyal 112077216b04STejun Heo static int throtl_dispatch_tg(struct throtl_grp *tg) 1121e43473b7SVivek Goyal { 112273f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 1123e43473b7SVivek Goyal unsigned int nr_reads = 0, nr_writes = 0; 1124e43473b7SVivek Goyal unsigned int max_nr_reads = throtl_grp_quantum*3/4; 1125c2f6805dSVivek Goyal unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 1126e43473b7SVivek Goyal struct bio *bio; 1127e43473b7SVivek Goyal 1128e43473b7SVivek Goyal /* Try to dispatch 75% READS and 25% WRITES */ 1129e43473b7SVivek Goyal 1130c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[READ])) && 11310f3457f6STejun Heo tg_may_dispatch(tg, bio, NULL)) { 1132e43473b7SVivek Goyal 113377216b04STejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1134e43473b7SVivek Goyal nr_reads++; 1135e43473b7SVivek Goyal 1136e43473b7SVivek Goyal if (nr_reads >= max_nr_reads) 1137e43473b7SVivek Goyal break; 1138e43473b7SVivek Goyal } 1139e43473b7SVivek Goyal 1140c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 11410f3457f6STejun Heo tg_may_dispatch(tg, bio, NULL)) { 1142e43473b7SVivek Goyal 114377216b04STejun Heo 
tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1144e43473b7SVivek Goyal nr_writes++; 1145e43473b7SVivek Goyal 1146e43473b7SVivek Goyal if (nr_writes >= max_nr_writes) 1147e43473b7SVivek Goyal break; 1148e43473b7SVivek Goyal } 1149e43473b7SVivek Goyal 1150e43473b7SVivek Goyal return nr_reads + nr_writes; 1151e43473b7SVivek Goyal } 1152e43473b7SVivek Goyal 1153651930bcSTejun Heo static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 1154e43473b7SVivek Goyal { 1155e43473b7SVivek Goyal unsigned int nr_disp = 0; 1156e43473b7SVivek Goyal 1157e43473b7SVivek Goyal while (1) { 115873f0d49aSTejun Heo struct throtl_grp *tg = throtl_rb_first(parent_sq); 115973f0d49aSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 1160e43473b7SVivek Goyal 1161e43473b7SVivek Goyal if (!tg) 1162e43473b7SVivek Goyal break; 1163e43473b7SVivek Goyal 1164e43473b7SVivek Goyal if (time_before(jiffies, tg->disptime)) 1165e43473b7SVivek Goyal break; 1166e43473b7SVivek Goyal 116777216b04STejun Heo throtl_dequeue_tg(tg); 1168e43473b7SVivek Goyal 116977216b04STejun Heo nr_disp += throtl_dispatch_tg(tg); 1170e43473b7SVivek Goyal 117173f0d49aSTejun Heo if (sq->nr_queued[0] || sq->nr_queued[1]) 117277216b04STejun Heo tg_update_disptime(tg); 1173e43473b7SVivek Goyal 1174e43473b7SVivek Goyal if (nr_disp >= throtl_quantum) 1175e43473b7SVivek Goyal break; 1176e43473b7SVivek Goyal } 1177e43473b7SVivek Goyal 1178e43473b7SVivek Goyal return nr_disp; 1179e43473b7SVivek Goyal } 1180e43473b7SVivek Goyal 11816e1a5704STejun Heo /** 11826e1a5704STejun Heo * throtl_pending_timer_fn - timer function for service_queue->pending_timer 11836e1a5704STejun Heo * @arg: the throtl_service_queue being serviced 11846e1a5704STejun Heo * 11856e1a5704STejun Heo * This timer is armed when a child throtl_grp with active bio's become 11866e1a5704STejun Heo * pending and queued on the service_queue's pending_tree and expires when 11876e1a5704STejun Heo * the first child throtl_grp should be dispatched. This function 11882e48a530STejun Heo * dispatches bio's from the children throtl_grps to the parent 11892e48a530STejun Heo * service_queue. 11902e48a530STejun Heo * 11912e48a530STejun Heo * If the parent's parent is another throtl_grp, dispatching is propagated 11922e48a530STejun Heo * by either arming its pending_timer or repeating dispatch directly. If 11932e48a530STejun Heo * the top-level service_tree is reached, throtl_data->dispatch_work is 11942e48a530STejun Heo * kicked so that the ready bio's are issued. 
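 *
 * (When the parent's dispatch window is already open,
 * throtl_schedule_next_dispatch() returns %false and this function jumps
 * back to the again: label to dispatch at the parent level directly
 * instead of re-arming a timer for every level.)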
11956e1a5704STejun Heo */ 119669df0ab0STejun Heo static void throtl_pending_timer_fn(unsigned long arg) 119769df0ab0STejun Heo { 119869df0ab0STejun Heo struct throtl_service_queue *sq = (void *)arg; 11992e48a530STejun Heo struct throtl_grp *tg = sq_to_tg(sq); 120069df0ab0STejun Heo struct throtl_data *td = sq_to_td(sq); 1201cb76199cSTejun Heo struct request_queue *q = td->queue; 12022e48a530STejun Heo struct throtl_service_queue *parent_sq; 12032e48a530STejun Heo bool dispatched; 12046e1a5704STejun Heo int ret; 1205e43473b7SVivek Goyal 1206e43473b7SVivek Goyal spin_lock_irq(q->queue_lock); 12072e48a530STejun Heo again: 12082e48a530STejun Heo parent_sq = sq->parent_sq; 12092e48a530STejun Heo dispatched = false; 1210e43473b7SVivek Goyal 12117f52f98cSTejun Heo while (true) { 1212fda6f272STejun Heo throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", 12132e48a530STejun Heo sq->nr_queued[READ] + sq->nr_queued[WRITE], 12142e48a530STejun Heo sq->nr_queued[READ], sq->nr_queued[WRITE]); 1215e43473b7SVivek Goyal 12167f52f98cSTejun Heo ret = throtl_select_dispatch(sq); 12177f52f98cSTejun Heo if (ret) { 12187f52f98cSTejun Heo throtl_log(sq, "bios disp=%u", ret); 12197f52f98cSTejun Heo dispatched = true; 1220651930bcSTejun Heo } 1221e43473b7SVivek Goyal 12227f52f98cSTejun Heo if (throtl_schedule_next_dispatch(sq, false)) 12237f52f98cSTejun Heo break; 12247f52f98cSTejun Heo 12257f52f98cSTejun Heo /* this dispatch window is still open, relax and repeat */ 12267f52f98cSTejun Heo spin_unlock_irq(q->queue_lock); 12277f52f98cSTejun Heo cpu_relax(); 12287f52f98cSTejun Heo spin_lock_irq(q->queue_lock); 12297f52f98cSTejun Heo } 12306a525600STejun Heo 12312e48a530STejun Heo if (!dispatched) 12322e48a530STejun Heo goto out_unlock; 12336e1a5704STejun Heo 12342e48a530STejun Heo if (parent_sq) { 12352e48a530STejun Heo /* @parent_sq is another throtl_grp, propagate dispatch */ 12362e48a530STejun Heo if (tg->flags & THROTL_TG_WAS_EMPTY) { 12372e48a530STejun Heo tg_update_disptime(tg); 12382e48a530STejun Heo if (!throtl_schedule_next_dispatch(parent_sq, false)) { 12392e48a530STejun Heo /* window is already open, repeat dispatching */ 12402e48a530STejun Heo sq = parent_sq; 12412e48a530STejun Heo tg = sq_to_tg(sq); 12422e48a530STejun Heo goto again; 12432e48a530STejun Heo } 12442e48a530STejun Heo } 12452e48a530STejun Heo } else { 12462e48a530STejun Heo /* reached the top-level, queue issuing */ 12472e48a530STejun Heo queue_work(kthrotld_workqueue, &td->dispatch_work); 12482e48a530STejun Heo } 12492e48a530STejun Heo out_unlock: 12506e1a5704STejun Heo spin_unlock_irq(q->queue_lock); 12516e1a5704STejun Heo } 12526e1a5704STejun Heo 12536e1a5704STejun Heo /** 12546e1a5704STejun Heo * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 12556e1a5704STejun Heo * @work: work item being executed 12566e1a5704STejun Heo * 12576e1a5704STejun Heo * This function is queued for execution when bio's reach the queued[] 12586e1a5704STejun Heo * lists of throtl_data->service_queue. Those bio's are ready and issued by this 12596e1a5704STejun Heo * function.
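 *
 * Note that the ready bio's are popped off the queued[] lists while
 * holding the queue_lock but are issued only after the lock is released,
 * batched under a block plug.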
12606e1a5704STejun Heo */ 12616e1a5704STejun Heo void blk_throtl_dispatch_work_fn(struct work_struct *work) 12626e1a5704STejun Heo { 12636e1a5704STejun Heo struct throtl_data *td = container_of(work, struct throtl_data, 12646e1a5704STejun Heo dispatch_work); 12656e1a5704STejun Heo struct throtl_service_queue *td_sq = &td->service_queue; 12666e1a5704STejun Heo struct request_queue *q = td->queue; 12676e1a5704STejun Heo struct bio_list bio_list_on_stack; 12686e1a5704STejun Heo struct bio *bio; 12696e1a5704STejun Heo struct blk_plug plug; 12706e1a5704STejun Heo int rw; 12716e1a5704STejun Heo 12726e1a5704STejun Heo bio_list_init(&bio_list_on_stack); 12736e1a5704STejun Heo 12746e1a5704STejun Heo spin_lock_irq(q->queue_lock); 1275c5cc2070STejun Heo for (rw = READ; rw <= WRITE; rw++) 1276c5cc2070STejun Heo while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1277c5cc2070STejun Heo bio_list_add(&bio_list_on_stack, bio); 1278e43473b7SVivek Goyal spin_unlock_irq(q->queue_lock); 1279e43473b7SVivek Goyal 12806e1a5704STejun Heo if (!bio_list_empty(&bio_list_on_stack)) { 128169d60eb9SVivek Goyal blk_start_plug(&plug); 1282e43473b7SVivek Goyal while((bio = bio_list_pop(&bio_list_on_stack))) 1283e43473b7SVivek Goyal generic_make_request(bio); 128469d60eb9SVivek Goyal blk_finish_plug(&plug); 1285e43473b7SVivek Goyal } 1286e43473b7SVivek Goyal } 1287e43473b7SVivek Goyal 1288f95a04afSTejun Heo static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, 1289f95a04afSTejun Heo struct blkg_policy_data *pd, int off) 129041b38b6dSTejun Heo { 1291f95a04afSTejun Heo struct throtl_grp *tg = pd_to_tg(pd); 129241b38b6dSTejun Heo struct blkg_rwstat rwstat = { }, tmp; 129341b38b6dSTejun Heo int i, cpu; 129441b38b6dSTejun Heo 129541b38b6dSTejun Heo for_each_possible_cpu(cpu) { 12968a3d2615STejun Heo struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); 129741b38b6dSTejun Heo 129841b38b6dSTejun Heo tmp = blkg_rwstat_read((void *)sc + off); 129941b38b6dSTejun Heo for (i = 0; i < BLKG_RWSTAT_NR; i++) 130041b38b6dSTejun Heo rwstat.cnt[i] += tmp.cnt[i]; 130141b38b6dSTejun Heo } 130241b38b6dSTejun Heo 1303f95a04afSTejun Heo return __blkg_prfill_rwstat(sf, pd, &rwstat); 130441b38b6dSTejun Heo } 130541b38b6dSTejun Heo 13062da8ca82STejun Heo static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) 130741b38b6dSTejun Heo { 13082da8ca82STejun Heo blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, 13092da8ca82STejun Heo &blkcg_policy_throtl, seq_cft(sf)->private, true); 131041b38b6dSTejun Heo return 0; 131141b38b6dSTejun Heo } 131241b38b6dSTejun Heo 1313f95a04afSTejun Heo static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1314f95a04afSTejun Heo int off) 131560c2bc2dSTejun Heo { 1316f95a04afSTejun Heo struct throtl_grp *tg = pd_to_tg(pd); 1317f95a04afSTejun Heo u64 v = *(u64 *)((void *)tg + off); 131860c2bc2dSTejun Heo 1319af133cebSTejun Heo if (v == -1) 132060c2bc2dSTejun Heo return 0; 1321f95a04afSTejun Heo return __blkg_prfill_u64(sf, pd, v); 132260c2bc2dSTejun Heo } 132360c2bc2dSTejun Heo 1324f95a04afSTejun Heo static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1325f95a04afSTejun Heo int off) 1326af133cebSTejun Heo { 1327f95a04afSTejun Heo struct throtl_grp *tg = pd_to_tg(pd); 1328f95a04afSTejun Heo unsigned int v = *(unsigned int *)((void *)tg + off); 1329af133cebSTejun Heo 1330af133cebSTejun Heo if (v == -1) 1331af133cebSTejun Heo return 0; 1332f95a04afSTejun Heo return __blkg_prfill_u64(sf, pd, v); 1333af133cebSTejun Heo } 
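/*
 * Illustrative usage (device numbers are made up): the limits read and
 * written by the helpers below are exposed through the throtl_files[]
 * cftypes as "blkio.throttle.*" files and take "<major>:<minor> <limit>"
 * pairs, e.g.
 *
 *   echo "8:16 1048576" > blkio.throttle.read_bps_device
 *
 * limits reads on device 8:16 to 1MB/s for that cgroup. Writing a limit
 * of 0 clears the rule (stored internally as -1, see tg_set_conf()).
 */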
1334af133cebSTejun Heo 13352da8ca82STejun Heo static int tg_print_conf_u64(struct seq_file *sf, void *v) 133660c2bc2dSTejun Heo { 13372da8ca82STejun Heo blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 13382da8ca82STejun Heo &blkcg_policy_throtl, seq_cft(sf)->private, false); 133960c2bc2dSTejun Heo return 0; 134060c2bc2dSTejun Heo } 134160c2bc2dSTejun Heo 13422da8ca82STejun Heo static int tg_print_conf_uint(struct seq_file *sf, void *v) 1343e43473b7SVivek Goyal { 13442da8ca82STejun Heo blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 13452da8ca82STejun Heo &blkcg_policy_throtl, seq_cft(sf)->private, false); 1346af133cebSTejun Heo return 0; 1347e43473b7SVivek Goyal } 1348e43473b7SVivek Goyal 1349*451af504STejun Heo static ssize_t tg_set_conf(struct kernfs_open_file *of, 1350*451af504STejun Heo char *buf, size_t nbytes, loff_t off, bool is_u64) 135160c2bc2dSTejun Heo { 1352*451af504STejun Heo struct blkcg *blkcg = css_to_blkcg(of_css(of)); 135360c2bc2dSTejun Heo struct blkg_conf_ctx ctx; 1354af133cebSTejun Heo struct throtl_grp *tg; 135569df0ab0STejun Heo struct throtl_service_queue *sq; 1356693e751eSTejun Heo struct blkcg_gq *blkg; 1357492eb21bSTejun Heo struct cgroup_subsys_state *pos_css; 135860c2bc2dSTejun Heo int ret; 135960c2bc2dSTejun Heo 13603c798398STejun Heo ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 136160c2bc2dSTejun Heo if (ret) 136260c2bc2dSTejun Heo return ret; 136360c2bc2dSTejun Heo 1364af133cebSTejun Heo tg = blkg_to_tg(ctx.blkg); 136569df0ab0STejun Heo sq = &tg->service_queue; 1366af133cebSTejun Heo 1367af133cebSTejun Heo if (!ctx.v) 1368af133cebSTejun Heo ctx.v = -1; 1369af133cebSTejun Heo 1370af133cebSTejun Heo if (is_u64) 1371*451af504STejun Heo *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; 1372af133cebSTejun Heo else 1373*451af504STejun Heo *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; 1374af133cebSTejun Heo 1375fda6f272STejun Heo throtl_log(&tg->service_queue, 1376fda6f272STejun Heo "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1377632b4493STejun Heo tg->bps[READ], tg->bps[WRITE], 1378632b4493STejun Heo tg->iops[READ], tg->iops[WRITE]); 1379632b4493STejun Heo 1380632b4493STejun Heo /* 1381693e751eSTejun Heo * Update has_rules[] flags for the updated tg's subtree. A tg is 1382693e751eSTejun Heo * considered to have rules if either the tg itself or any of its 1383693e751eSTejun Heo * ancestors has rules. This identifies groups without any 1384693e751eSTejun Heo * restrictions in the whole hierarchy and allows them to bypass 1385693e751eSTejun Heo * blk-throttle. 1386693e751eSTejun Heo */ 1387492eb21bSTejun Heo blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) 1388693e751eSTejun Heo tg_update_has_rules(blkg_to_tg(blkg)); 1389693e751eSTejun Heo 1390693e751eSTejun Heo /* 1391632b4493STejun Heo * We're already holding queue_lock and know @tg is valid. Let's 1392632b4493STejun Heo * apply the new config directly. 1393632b4493STejun Heo * 1394632b4493STejun Heo * Restart the slices for both READ and WRITE. It might happen 1395632b4493STejun Heo * that a group's limits are dropped suddenly and we don't want to 1396632b4493STejun Heo * account recently dispatched IO at the new low rate.
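 * For example (illustrative numbers), if the read limit is dropped from
 * 100MB/s to 1MB/s in the middle of a slice, charging the IO already
 * dispatched at the old rate against the new one would stall the group
 * for a long time.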
1397632b4493STejun Heo */ 13980f3457f6STejun Heo throtl_start_new_slice(tg, 0); 13990f3457f6STejun Heo throtl_start_new_slice(tg, 1); 1400632b4493STejun Heo 14015b2c16aaSTejun Heo if (tg->flags & THROTL_TG_PENDING) { 140277216b04STejun Heo tg_update_disptime(tg); 14037f52f98cSTejun Heo throtl_schedule_next_dispatch(sq->parent_sq, true); 1404632b4493STejun Heo } 1405af133cebSTejun Heo 140660c2bc2dSTejun Heo blkg_conf_finish(&ctx); 1407*451af504STejun Heo return nbytes; 140860c2bc2dSTejun Heo } 140960c2bc2dSTejun Heo 1410*451af504STejun Heo static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1411*451af504STejun Heo char *buf, size_t nbytes, loff_t off) 141260c2bc2dSTejun Heo { 1413*451af504STejun Heo return tg_set_conf(of, buf, nbytes, off, true); 141460c2bc2dSTejun Heo } 141560c2bc2dSTejun Heo 1416*451af504STejun Heo static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, 1417*451af504STejun Heo char *buf, size_t nbytes, loff_t off) 141860c2bc2dSTejun Heo { 1419*451af504STejun Heo return tg_set_conf(of, buf, nbytes, off, false); 142060c2bc2dSTejun Heo } 142160c2bc2dSTejun Heo 142260c2bc2dSTejun Heo static struct cftype throtl_files[] = { 142360c2bc2dSTejun Heo { 142460c2bc2dSTejun Heo .name = "throttle.read_bps_device", 1425af133cebSTejun Heo .private = offsetof(struct throtl_grp, bps[READ]), 14262da8ca82STejun Heo .seq_show = tg_print_conf_u64, 1427*451af504STejun Heo .write = tg_set_conf_u64, 142860c2bc2dSTejun Heo }, 142960c2bc2dSTejun Heo { 143060c2bc2dSTejun Heo .name = "throttle.write_bps_device", 1431af133cebSTejun Heo .private = offsetof(struct throtl_grp, bps[WRITE]), 14322da8ca82STejun Heo .seq_show = tg_print_conf_u64, 1433*451af504STejun Heo .write = tg_set_conf_u64, 143460c2bc2dSTejun Heo }, 143560c2bc2dSTejun Heo { 143660c2bc2dSTejun Heo .name = "throttle.read_iops_device", 1437af133cebSTejun Heo .private = offsetof(struct throtl_grp, iops[READ]), 14382da8ca82STejun Heo .seq_show = tg_print_conf_uint, 1439*451af504STejun Heo .write = tg_set_conf_uint, 144060c2bc2dSTejun Heo }, 144160c2bc2dSTejun Heo { 144260c2bc2dSTejun Heo .name = "throttle.write_iops_device", 1443af133cebSTejun Heo .private = offsetof(struct throtl_grp, iops[WRITE]), 14442da8ca82STejun Heo .seq_show = tg_print_conf_uint, 1445*451af504STejun Heo .write = tg_set_conf_uint, 144660c2bc2dSTejun Heo }, 144760c2bc2dSTejun Heo { 144860c2bc2dSTejun Heo .name = "throttle.io_service_bytes", 14495bc4afb1STejun Heo .private = offsetof(struct tg_stats_cpu, service_bytes), 14502da8ca82STejun Heo .seq_show = tg_print_cpu_rwstat, 145160c2bc2dSTejun Heo }, 145260c2bc2dSTejun Heo { 145360c2bc2dSTejun Heo .name = "throttle.io_serviced", 14545bc4afb1STejun Heo .private = offsetof(struct tg_stats_cpu, serviced), 14552da8ca82STejun Heo .seq_show = tg_print_cpu_rwstat, 145660c2bc2dSTejun Heo }, 145760c2bc2dSTejun Heo { } /* terminate */ 145860c2bc2dSTejun Heo }; 145960c2bc2dSTejun Heo 1460da527770SVivek Goyal static void throtl_shutdown_wq(struct request_queue *q) 1461e43473b7SVivek Goyal { 1462e43473b7SVivek Goyal struct throtl_data *td = q->td; 1463e43473b7SVivek Goyal 146469df0ab0STejun Heo cancel_work_sync(&td->dispatch_work); 1465e43473b7SVivek Goyal } 1466e43473b7SVivek Goyal 14673c798398STejun Heo static struct blkcg_policy blkcg_policy_throtl = { 1468f9fcc2d3STejun Heo .pd_size = sizeof(struct throtl_grp), 1469f9fcc2d3STejun Heo .cftypes = throtl_files, 1470f9fcc2d3STejun Heo 14713c798398STejun Heo .pd_init_fn = throtl_pd_init, 1472693e751eSTejun Heo .pd_online_fn = throtl_pd_online, 14733c798398STejun Heo 
.pd_exit_fn = throtl_pd_exit, 14743c798398STejun Heo .pd_reset_stats_fn = throtl_pd_reset_stats, 1475e43473b7SVivek Goyal }; 1476e43473b7SVivek Goyal 1477bc16a4f9STejun Heo bool blk_throtl_bio(struct request_queue *q, struct bio *bio) 1478e43473b7SVivek Goyal { 1479e43473b7SVivek Goyal struct throtl_data *td = q->td; 1480c5cc2070STejun Heo struct throtl_qnode *qn = NULL; 1481e43473b7SVivek Goyal struct throtl_grp *tg; 148273f0d49aSTejun Heo struct throtl_service_queue *sq; 14830e9f4164STejun Heo bool rw = bio_data_dir(bio); 14843c798398STejun Heo struct blkcg *blkcg; 1485bc16a4f9STejun Heo bool throttled = false; 1486e43473b7SVivek Goyal 14872a0f61e6STejun Heo /* see throtl_charge_bio() */ 14882a0f61e6STejun Heo if (bio->bi_rw & REQ_THROTTLED) 1489bc16a4f9STejun Heo goto out; 1490e43473b7SVivek Goyal 1491af75cd3cSVivek Goyal /* 1492af75cd3cSVivek Goyal * A throtl_grp pointer retrieved under rcu can be used to access 1493af75cd3cSVivek Goyal * basic fields like stats and io rates. If a group has no rules, 1494af75cd3cSVivek Goyal * just update the dispatch stats in a lockless manner and return. 1495af75cd3cSVivek Goyal */ 1496af75cd3cSVivek Goyal rcu_read_lock(); 14973c798398STejun Heo blkcg = bio_blkcg(bio); 1498cd1604faSTejun Heo tg = throtl_lookup_tg(td, blkcg); 1499af75cd3cSVivek Goyal if (tg) { 1500693e751eSTejun Heo if (!tg->has_rules[rw]) { 1501629ed0b1STejun Heo throtl_update_dispatch_stats(tg_to_blkg(tg), 15024f024f37SKent Overstreet bio->bi_iter.bi_size, bio->bi_rw); 15032a7f1244STejun Heo goto out_unlock_rcu; 1504af75cd3cSVivek Goyal } 1505af75cd3cSVivek Goyal } 1506af75cd3cSVivek Goyal 1507af75cd3cSVivek Goyal /* 1508af75cd3cSVivek Goyal * Either the group has not been allocated yet or it is not an unlimited 1509af75cd3cSVivek Goyal * IO group 1510af75cd3cSVivek Goyal */ 1511e43473b7SVivek Goyal spin_lock_irq(q->queue_lock); 1512cd1604faSTejun Heo tg = throtl_lookup_create_tg(td, blkcg); 1513bc16a4f9STejun Heo if (unlikely(!tg)) 1514bc16a4f9STejun Heo goto out_unlock; 1515f469a7b4SVivek Goyal 151673f0d49aSTejun Heo sq = &tg->service_queue; 151773f0d49aSTejun Heo 15189e660acfSTejun Heo while (true) { 15199e660acfSTejun Heo /* throtl is FIFO - if bios are already queued, should queue */ 15200e9f4164STejun Heo if (sq->nr_queued[rw]) 15219e660acfSTejun Heo break; 1522de701c74SVivek Goyal 15239e660acfSTejun Heo /* if above limits, break to queue */ 15249e660acfSTejun Heo if (!tg_may_dispatch(tg, bio, NULL)) 15259e660acfSTejun Heo break; 15269e660acfSTejun Heo 15279e660acfSTejun Heo /* within limits, let's charge and dispatch directly */ 1528e43473b7SVivek Goyal throtl_charge_bio(tg, bio); 152904521db0SVivek Goyal 153004521db0SVivek Goyal /* 153104521db0SVivek Goyal * We need to trim slice even when bios are not being queued 153204521db0SVivek Goyal * otherwise it might happen that a bio is not queued for 153304521db0SVivek Goyal * a long time and slice keeps on extending and trim is not 153404521db0SVivek Goyal * called for a long time. Now if limits are reduced suddenly 153504521db0SVivek Goyal * we take into account all the IO dispatched so far at the new 153604521db0SVivek Goyal * low rate and newly queued IO gets a really long dispatch 153704521db0SVivek Goyal * time. 153804521db0SVivek Goyal * 153904521db0SVivek Goyal * So keep on trimming slice even if bio is not queued. 154004521db0SVivek Goyal */ 15410f3457f6STejun Heo throtl_trim_slice(tg, rw); 15429e660acfSTejun Heo 15439e660acfSTejun Heo /* 15449e660acfSTejun Heo * @bio passed through this layer without being throttled.
15459e660acfSTejun Heo * Climb up the ladder. If we're already at the top, it 15469e660acfSTejun Heo * can be executed directly. 15479e660acfSTejun Heo */ 1548c5cc2070STejun Heo qn = &tg->qnode_on_parent[rw]; 15499e660acfSTejun Heo sq = sq->parent_sq; 15509e660acfSTejun Heo tg = sq_to_tg(sq); 15519e660acfSTejun Heo if (!tg) 1552bc16a4f9STejun Heo goto out_unlock; 1553e43473b7SVivek Goyal } 1554e43473b7SVivek Goyal 15559e660acfSTejun Heo /* out-of-limit, queue to @tg */ 1556fda6f272STejun Heo throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 15578e89d13fSVivek Goyal rw == READ ? 'R' : 'W', 15584f024f37SKent Overstreet tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], 15598e89d13fSVivek Goyal tg->io_disp[rw], tg->iops[rw], 156073f0d49aSTejun Heo sq->nr_queued[READ], sq->nr_queued[WRITE]); 1561e43473b7SVivek Goyal 1562671058fbSTejun Heo bio_associate_current(bio); 15636bc9c2b4STejun Heo tg->td->nr_queued[rw]++; 1564c5cc2070STejun Heo throtl_add_bio_tg(bio, qn, tg); 1565bc16a4f9STejun Heo throttled = true; 1566e43473b7SVivek Goyal 15677f52f98cSTejun Heo /* 15687f52f98cSTejun Heo * Update @tg's dispatch time and force schedule dispatch if @tg 15697f52f98cSTejun Heo * was empty before @bio. The forced scheduling isn't likely to 15707f52f98cSTejun Heo * cause undue delay as @bio is likely to be dispatched directly if 15717f52f98cSTejun Heo * its @tg's disptime is not in the future. 15727f52f98cSTejun Heo */ 15730e9f4164STejun Heo if (tg->flags & THROTL_TG_WAS_EMPTY) { 157477216b04STejun Heo tg_update_disptime(tg); 15757f52f98cSTejun Heo throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); 1576e43473b7SVivek Goyal } 1577e43473b7SVivek Goyal 1578bc16a4f9STejun Heo out_unlock: 1579e43473b7SVivek Goyal spin_unlock_irq(q->queue_lock); 15802a7f1244STejun Heo out_unlock_rcu: 15812a7f1244STejun Heo rcu_read_unlock(); 1582bc16a4f9STejun Heo out: 15832a0f61e6STejun Heo /* 15842a0f61e6STejun Heo * As multiple blk-throtls may stack in the same issue path, we 15852a0f61e6STejun Heo * don't want bios to leave with the flag set. Clear the flag if 15862a0f61e6STejun Heo * being issued. 15872a0f61e6STejun Heo */ 15882a0f61e6STejun Heo if (!throttled) 15892a0f61e6STejun Heo bio->bi_rw &= ~REQ_THROTTLED; 1590bc16a4f9STejun Heo return throttled; 1591e43473b7SVivek Goyal } 1592e43473b7SVivek Goyal 15932a12f0dcSTejun Heo /* 15942a12f0dcSTejun Heo * Dispatch all bios from all children tg's queued on @parent_sq. On 15952a12f0dcSTejun Heo * return, @parent_sq is guaranteed to not have any active children tg's 15962a12f0dcSTejun Heo * and all bios from previously active tg's are on @parent_sq->queued[].
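 *
 * Note that, unlike the regular dispatch path, draining ignores the
 * configured limits: every queued bio is pushed upwards without calling
 * tg_may_dispatch().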
15972a12f0dcSTejun Heo */ 15982a12f0dcSTejun Heo static void tg_drain_bios(struct throtl_service_queue *parent_sq) 15992a12f0dcSTejun Heo { 16002a12f0dcSTejun Heo struct throtl_grp *tg; 16012a12f0dcSTejun Heo 16022a12f0dcSTejun Heo while ((tg = throtl_rb_first(parent_sq))) { 16032a12f0dcSTejun Heo struct throtl_service_queue *sq = &tg->service_queue; 16042a12f0dcSTejun Heo struct bio *bio; 16052a12f0dcSTejun Heo 16062a12f0dcSTejun Heo throtl_dequeue_tg(tg); 16072a12f0dcSTejun Heo 1608c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[READ]))) 16092a12f0dcSTejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1610c5cc2070STejun Heo while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) 16112a12f0dcSTejun Heo tg_dispatch_one_bio(tg, bio_data_dir(bio)); 16122a12f0dcSTejun Heo } 16132a12f0dcSTejun Heo } 16142a12f0dcSTejun Heo 1615c9a929ddSTejun Heo /** 1616c9a929ddSTejun Heo * blk_throtl_drain - drain throttled bios 1617c9a929ddSTejun Heo * @q: request_queue to drain throttled bios for 1618c9a929ddSTejun Heo * 1619c9a929ddSTejun Heo * Dispatch all currently throttled bios on @q through ->make_request_fn(). 1620c9a929ddSTejun Heo */ 1621c9a929ddSTejun Heo void blk_throtl_drain(struct request_queue *q) 1622c9a929ddSTejun Heo __releases(q->queue_lock) __acquires(q->queue_lock) 1623c9a929ddSTejun Heo { 1624c9a929ddSTejun Heo struct throtl_data *td = q->td; 16252a12f0dcSTejun Heo struct blkcg_gq *blkg; 1626492eb21bSTejun Heo struct cgroup_subsys_state *pos_css; 1627c9a929ddSTejun Heo struct bio *bio; 1628651930bcSTejun Heo int rw; 1629c9a929ddSTejun Heo 16308bcb6c7dSAndi Kleen queue_lockdep_assert_held(q); 16312a12f0dcSTejun Heo rcu_read_lock(); 1632c9a929ddSTejun Heo 16332a12f0dcSTejun Heo /* 16342a12f0dcSTejun Heo * Drain each tg while doing post-order walk on the blkg tree, so 16352a12f0dcSTejun Heo * that all bios are propagated to td->service_queue. It'd be 16362a12f0dcSTejun Heo * better to walk service_queue tree directly but blkg walk is 16372a12f0dcSTejun Heo * easier. 
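 * A post-order walk guarantees that children are drained before their
 * parents, so each bio only needs to be propagated upwards one level at
 * a time until it reaches td->service_queue.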
16382a12f0dcSTejun Heo */ 1639492eb21bSTejun Heo blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) 16402a12f0dcSTejun Heo tg_drain_bios(&blkg_to_tg(blkg)->service_queue); 164173f0d49aSTejun Heo 16422a12f0dcSTejun Heo /* finally, transfer bios from top-level tg's into the td */ 16432a12f0dcSTejun Heo tg_drain_bios(&td->service_queue); 16442a12f0dcSTejun Heo 16452a12f0dcSTejun Heo rcu_read_unlock(); 1646c9a929ddSTejun Heo spin_unlock_irq(q->queue_lock); 1647c9a929ddSTejun Heo 16482a12f0dcSTejun Heo /* all bios now should be in td->service_queue, issue them */ 1649651930bcSTejun Heo for (rw = READ; rw <= WRITE; rw++) 1650c5cc2070STejun Heo while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], 1651c5cc2070STejun Heo NULL))) 1652c9a929ddSTejun Heo generic_make_request(bio); 1653c9a929ddSTejun Heo 1654c9a929ddSTejun Heo spin_lock_irq(q->queue_lock); 1655c9a929ddSTejun Heo } 1656c9a929ddSTejun Heo 1657e43473b7SVivek Goyal int blk_throtl_init(struct request_queue *q) 1658e43473b7SVivek Goyal { 1659e43473b7SVivek Goyal struct throtl_data *td; 1660a2b1693bSTejun Heo int ret; 1661e43473b7SVivek Goyal 1662e43473b7SVivek Goyal td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1663e43473b7SVivek Goyal if (!td) 1664e43473b7SVivek Goyal return -ENOMEM; 1665e43473b7SVivek Goyal 166669df0ab0STejun Heo INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 166777216b04STejun Heo throtl_service_queue_init(&td->service_queue, NULL); 1668e43473b7SVivek Goyal 1669cd1604faSTejun Heo q->td = td; 167029b12589SVivek Goyal td->queue = q; 167102977e4aSVivek Goyal 1672a2b1693bSTejun Heo /* activate policy */ 16733c798398STejun Heo ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 1674a2b1693bSTejun Heo if (ret) 167529b12589SVivek Goyal kfree(td); 1676a2b1693bSTejun Heo return ret; 1677e43473b7SVivek Goyal } 1678e43473b7SVivek Goyal 1679e43473b7SVivek Goyal void blk_throtl_exit(struct request_queue *q) 1680e43473b7SVivek Goyal { 1681c875f4d0STejun Heo BUG_ON(!q->td); 1682da527770SVivek Goyal throtl_shutdown_wq(q); 16833c798398STejun Heo blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1684c9a929ddSTejun Heo kfree(q->td); 1685e43473b7SVivek Goyal } 1686e43473b7SVivek Goyal 1687e43473b7SVivek Goyal static int __init throtl_init(void) 1688e43473b7SVivek Goyal { 1689450adcbeSVivek Goyal kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1690450adcbeSVivek Goyal if (!kthrotld_workqueue) 1691450adcbeSVivek Goyal panic("Failed to create kthrotld\n"); 1692450adcbeSVivek Goyal 16933c798398STejun Heo return blkcg_policy_register(&blkcg_policy_throtl); 1694e43473b7SVivek Goyal } 1695e43473b7SVivek Goyal 1696e43473b7SVivek Goyal module_init(throtl_init); 1697