/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued.  When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * of a local or child group which can queue many bios at once filling up
 * the list starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from.  When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's.  A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
struct throtl_qnode {
	struct list_head	node;		/* service_queue->queued[] */
	struct bio_list		bios;		/* queued bios */
	struct throtl_grp	*tg;		/* tg this qnode belongs to */
};
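
/*
 * Editor's illustration (not part of the kernel source): with two qnodes
 * A and B on the same queued[] list, each holding two bios, repeated
 * throtl_pop_queued() calls below drain them round-robin:
 *
 *	queued: A(a1,a2) -> B(b1,b2)
 *	pop a1, A requeued at tail:   B(b1,b2) -> A(a2)
 *	pop b1, B requeued at tail:   A(a2) -> B(b2)
 *	pop a2, A empty, tg ref put:  B(b2)
 *	pop b2, B empty, tg ref put:  (empty)
 *
 * so no single source can monopolize the parent's dispatch window.
 */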

struct throtl_service_queue {
	struct throtl_service_queue *parent_sq;	/* the parent service_queue */

	/*
	 * Bios queued directly to this service_queue or dispatched from
	 * children throtl_grp's.
	 */
	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
	unsigned int		nr_queued[2];	/* number of queued bios */

	/*
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
	struct rb_root		pending_tree;	/* RB tree of active tgs */
	struct rb_node		*first_pending;	/* first node in the tree */
	unsigned int		nr_pending;	/* # queued in the tree */
	unsigned long		first_pending_disptime;	/* disptime of the first tg */
	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
};

enum tg_state_flags {
	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
};

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_queue member */
	struct rb_node rb_node;

	/* throtl_data this group belongs to */
	struct throtl_data *td;

	/* this group's service queue */
	struct throtl_service_queue service_queue;

	/*
	 * qnode_on_self is used when bios are directly queued to this
	 * throtl_grp so that local bios compete fairly with bios
	 * dispatched from children.  qnode_on_parent is used when bios are
	 * dispatched from this throtl_grp into its parent and will compete
	 * with the sibling qnode_on_parents and the parent's
	 * qnode_on_self.
	 */
	struct throtl_qnode qnode_on_self[2];
	struct throtl_qnode qnode_on_parent[2];

	/*
	 * Dispatch time in jiffies.  This is the estimated time when the
	 * group will unthrottle and is ready to dispatch more bios.  It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* are there any throtl rules between this group and td? */
	bool has_rules[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* number of total undestroyed groups */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct work_struct dispatch_work;
};

static void throtl_pending_timer_fn(unsigned long arg);
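
/*
 * Editor's note on containment: a blkcg_gq (blkg) carries per-policy
 * blkg_policy_data, and for blk-throttle that pd is the first member of a
 * throtl_grp.  The helpers below only walk these embeddings with
 * container_of(), so the conversions are cheap and round-trip, e.g.:
 *
 *	struct throtl_grp *tg = blkg_to_tg(blkg);
 *	WARN_ON(tg_to_blkg(tg) != blkg);
 */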

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

/**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
 * embedded in throtl_data, %NULL is returned.
 */
static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
{
	if (sq && sq->parent_sq)
		return container_of(sq, struct throtl_grp, service_queue);
	else
		return NULL;
}

/**
 * sq_to_td - return throtl_data the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
 * Determine the associated throtl_data accordingly and return it.
 */
static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
{
	struct throtl_grp *tg = sq_to_tg(sq);

	if (tg)
		return tg->td;
	else
		return container_of(sq, struct throtl_data, service_queue);
}
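
/*
 * Editor's example: the top-level service_queue embedded in throtl_data
 * has no parent, so sq_to_tg() returns NULL for it and sq_to_td() falls
 * back to container_of(sq, struct throtl_data, service_queue); for a
 * service_queue embedded in a throtl_grp, both helpers resolve through
 * the tg.
 */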

/**
 * throtl_log - log debug message via blktrace
 * @sq: the service_queue being reported
 * @fmt: printf format string
 * @args: printf args
 *
 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
 * throtl_grp; otherwise, just "throtl".
 *
 * TODO: this should be made a function and name formatting should happen
 * after testing whether blktrace is enabled.
 */
#define throtl_log(sq, fmt, args...)	do {				\
	struct throtl_grp *__tg = sq_to_tg((sq));			\
	struct throtl_data *__td = sq_to_td((sq));			\
									\
	(void)__td;							\
	if ((__tg)) {							\
		char __pbuf[128];					\
									\
		blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));	\
		blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
	} else {							\
		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
	}								\
} while (0)
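
/*
 * Example usage (editor's sketch, matching calls later in this file):
 *
 *	throtl_log(sq, "bios disp=%u", ret);
 *
 * shows up in blktrace as "throtl <blkg-path> bios disp=N" when @sq
 * belongs to a throtl_grp, or as "throtl bios disp=N" for the top level.
 */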

static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
	INIT_LIST_HEAD(&qn->node);
	bio_list_init(&qn->bios);
	qn->tg = tg;
}

/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @queued: the service_queue->queued[] list @qn belongs to
 *
 * Add @bio to @qn and put @qn on @queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated.  See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct list_head *queued)
{
	bio_list_add(&qn->bios, bio);
	if (list_empty(&qn->node)) {
		list_add_tail(&qn->node, queued);
		blkg_get(tg_to_blkg(qn->tg));
	}
}

/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	bio = bio_list_peek(&qn->bios);
	WARN_ON_ONCE(!bio);
	return bio;
}

/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 *
 * Pop the first bio from the qnode list @queued.  After popping, the first
 * qnode is removed from @queued if empty or moved to the end of @queued so
 * that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too.  If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
 */
static struct bio *throtl_pop_queued(struct list_head *queued,
				     struct throtl_grp **tg_to_put)
{
	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	bio = bio_list_pop(&qn->bios);
	WARN_ON_ONCE(!bio);

	if (bio_list_empty(&qn->bios)) {
		list_del_init(&qn->node);
		if (tg_to_put)
			*tg_to_put = qn->tg;
		else
			blkg_put(tg_to_blkg(qn->tg));
	} else {
		list_move_tail(&qn->node, queued);
	}

	return bio;
}
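
/*
 * Typical draining pattern (editor's sketch, mirroring
 * blk_throtl_dispatch_work_fn() below); the round-robin behavior above
 * interleaves the queued sources automatically:
 *
 *	while ((bio = throtl_pop_queued(&sq->queued[rw], NULL)))
 *		bio_list_add(&bio_list_on_stack, bio);
 */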

/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
	INIT_LIST_HEAD(&sq->queued[0]);
	INIT_LIST_HEAD(&sq->queued[1]);
	sq->pending_tree = RB_ROOT;
	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
		    (unsigned long)sq);
}

static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
{
	struct throtl_grp *tg;
	int rw;

	tg = kzalloc_node(sizeof(*tg), gfp, node);
	if (!tg)
		return NULL;

	throtl_service_queue_init(&tg->service_queue);

	for (rw = READ; rw <= WRITE; rw++) {
		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
	}

	RB_CLEAR_NODE(&tg->rb_node);
	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	return &tg->pd;
}

static void throtl_pd_init(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkcg_gq *blkg = tg_to_blkg(tg);
	struct throtl_data *td = blkg->q->td;
	struct throtl_service_queue *sq = &tg->service_queue;

	/*
	 * If on the default hierarchy, we switch to properly hierarchical
	 * behavior where limits on a given throtl_grp are applied to the
	 * whole subtree rather than just the group itself.  e.g. If 16M
	 * read_bps limit is set on the root group, the whole system can't
	 * exceed 16M for the device.
	 *
	 * If not on the default hierarchy, the broken flat hierarchy
	 * behavior is retained where all throtl_grps are treated as if
	 * they're all separate root groups right below throtl_data.
	 * Limits of a group don't interact with limits of other groups
	 * regardless of the position of the group in the hierarchy.
	 */
	sq->parent_sq = &td->service_queue;
	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
	tg->td = td;
}
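
/*
 * Editor's sketch of the resulting wiring for a chain root <- a <- b on
 * the default hierarchy:
 *
 *	b's service_queue.parent_sq    == &tg(a)->service_queue
 *	a's service_queue.parent_sq    == &tg(root)->service_queue
 *	root's service_queue.parent_sq == &td->service_queue
 *
 * On the legacy hierarchies, every group's parent_sq points directly at
 * &td->service_queue instead.
 */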

/*
 * Set has_rules[] if @tg or any of its parents have limits configured.
 * This doesn't require walking up to the top of the hierarchy as the
 * parent's has_rules[] is guaranteed to be correct.
 */
static void tg_update_has_rules(struct throtl_grp *tg)
{
	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
	int rw;

	for (rw = READ; rw <= WRITE; rw++)
		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
				    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
}

static void throtl_pd_online(struct blkg_policy_data *pd)
{
	/*
	 * We don't want new groups to escape the limits of their ancestors.
	 * Update has_rules[] after a new group is brought online.
	 */
	tg_update_has_rules(pd_to_tg(pd));
}

static void throtl_pd_free(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);

	del_timer_sync(&tg->service_queue.pending_timer);
	kfree(tg);
}

static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
	/* Service tree is empty */
	if (!parent_sq->nr_pending)
		return NULL;

	if (!parent_sq->first_pending)
		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);

	if (parent_sq->first_pending)
		return rb_entry_tg(parent_sq->first_pending);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n,
			    struct throtl_service_queue *parent_sq)
{
	if (parent_sq->first_pending == n)
		parent_sq->first_pending = NULL;
	rb_erase_init(n, &parent_sq->pending_tree);
	--parent_sq->nr_pending;
}

static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(parent_sq);
	if (!tg)
		return;

	parent_sq->first_pending_disptime = tg->disptime;
}
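
/*
 * Editor's note: ->first_pending caches the leftmost node (earliest
 * ->disptime) so that repeated throtl_rb_first() calls avoid an O(log n)
 * walk.  It is cleared when that node is erased above and refreshed
 * lazily, and tg_service_queue_add() below updates it whenever a new
 * leftmost node is inserted.
 */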

static void tg_service_queue_add(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
	struct rb_node **node = &parent_sq->pending_tree.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		parent_sq->first_pending = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
}

static void __throtl_enqueue_tg(struct throtl_grp *tg)
{
	tg_service_queue_add(tg);
	tg->flags |= THROTL_TG_PENDING;
	tg->service_queue.parent_sq->nr_pending++;
}

static void throtl_enqueue_tg(struct throtl_grp *tg)
{
	if (!(tg->flags & THROTL_TG_PENDING))
		__throtl_enqueue_tg(tg);
}

static void __throtl_dequeue_tg(struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
	tg->flags &= ~THROTL_TG_PENDING;
}

static void throtl_dequeue_tg(struct throtl_grp *tg)
{
	if (tg->flags & THROTL_TG_PENDING)
		__throtl_dequeue_tg(tg);
}

/* Call with queue lock held */
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
					  unsigned long expires)
{
	mod_timer(&sq->pending_timer, expires);
	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
		   expires - jiffies, jiffies);
}

/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either timer
 * is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
					  bool force)
{
	/* any pending children left? */
	if (!sq->nr_pending)
		return true;

	update_min_dispatch_time(sq);

	/* is the next dispatch time in the future? */
	if (force || time_after(sq->first_pending_disptime, jiffies)) {
		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
		return true;
	}

	/* tell the caller to continue dispatching */
	return false;
}
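
/*
 * Caller pattern (editor's sketch, as used by throtl_pending_timer_fn()
 * below):
 *
 *	while (true) {
 *		throtl_select_dispatch(sq);
 *		if (throtl_schedule_next_dispatch(sq, false))
 *			break;	(timer armed or nothing pending)
 *		(window still open: drop the lock, relax, dispatch again)
 *	}
 */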

static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
						      bool rw, unsigned long start)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;

	/*
	 * Previous slice has expired.  We must have trimmed it after last
	 * bio dispatch.  That means since start of last slice, we never used
	 * that bandwidth.  Do try to make use of that bandwidth while giving
	 * credit.
	 */
	if (time_after_eq(start, tg->slice_start[rw]))
		tg->slice_start[rw] = start;

	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log(&tg->service_queue,
		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return false;

	return true;
}
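
/*
 * Editor's example: with HZ == 1000, throtl_slice is 100 jiffies, so a
 * slice started at jiffies == 5000 initially ends at 5100, and
 * throtl_extend_slice(tg, rw, 5150) rounds the end up to 5200.
 */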

/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then time slices don't get
	 * renewed.  Don't try to trim the slice if the slice is used up.
	 * A new slice will start when appropriate.
	 */
	if (throtl_slice_used(tg, rw))
		return;

	/*
	 * A bio has been dispatched.  Also adjust slice_end.  It might happen
	 * that the cgroup limit was initially very low, resulting in a high
	 * slice_end, but the limit was later bumped up and the bio was
	 * dispatched sooner; then we need to reduce slice_end.  A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */

	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log(&tg->service_queue,
		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
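
/*
 * Editor's worked example (assuming HZ == 1000, i.e. throtl_slice == 100):
 * with bps == 1048576 (1MB/s) and two full slices elapsed, bytes_trim ==
 * 1048576 * 100 * 2 / 1000 == 209715, so roughly the bandwidth budget of
 * the elapsed 200ms is subtracted from bytes_disp and slice_start
 * advances by 200 jiffies.
 */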

static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
				  unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started.  Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
	 * will allow dispatch after 1 second and after that slice should
	 * have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return false;
}
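
/*
 * Editor's example: with iops == 100, HZ == 1000 and a freshly rounded
 * slice (jiffy_elapsed_rnd == 100), io_allowed == 100 * 100 / 1000 == 10,
 * so an 11th bio in the slice is over the limit and waits roughly
 * ((io_disp + 1) * HZ) / iops == 110 jiffies (plus one for rounding)
 * minus the time already elapsed.
 */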

static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
				 unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started.  Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did.  Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return false;
}
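
/*
 * Editor's worked example (HZ == 1000, throtl_slice == 100): bps ==
 * 1048576 (1MB/s), bytes_disp == 0, 50 jiffies elapsed (rounded up to
 * 100) gives bytes_allowed == 104857.  A 262144-byte bio then has
 * extra_bytes == 157287, so jiffy_wait == 150, plus the rounding slack of
 * 50, i.e. about a 200ms wait, which matches 256KB at 1MB/s measured
 * from the slice start.
 */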

/*
 * Returns whether one can dispatch a bio or not.  Also returns the approx
 * number of jiffies to wait before this bio is within the IO rate and can
 * be dispatched.
 */
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
			    unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list.  So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->service_queue.nr_queued[rw] &&
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return true;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now.
	 */
	if (throtl_slice_used(tg, rw))
		throtl_start_new_slice(tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return true;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(tg, rw, jiffies + max_wait);

	return false;
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
	tg->io_disp[rw]++;

	/*
	 * REQ_THROTTLED is used to prevent the same bio from being throttled
	 * more than once as a throttled bio will go through blk-throtl the
	 * second time when it eventually gets issued.  Set it when a bio
	 * is being charged to a tg.
	 */
	if (!(bio->bi_rw & REQ_THROTTLED))
		bio->bi_rw |= REQ_THROTTLED;
}

/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
 * tg->qnode_on_self[] is used.
 */
static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
			      struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool rw = bio_data_dir(bio);

	if (!qn)
		qn = &tg->qnode_on_self[rw];

	/*
	 * If @tg doesn't currently have any bios queued in the same
	 * direction, queueing @bio can change when @tg should be
	 * dispatched.  Mark that @tg was empty.  This is automatically
	 * cleared on the next tg_update_disptime().
	 */
	if (!sq->nr_queued[rw])
		tg->flags |= THROTL_TG_WAS_EMPTY;

	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);

	sq->nr_queued[rw]++;
	throtl_enqueue_tg(tg);
}

static void tg_update_disptime(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = throtl_peek_queued(&sq->queued[READ])))
		tg_may_dispatch(tg, bio, &read_wait);

	if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
		tg_may_dispatch(tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(tg);

	/* see throtl_add_bio_tg() */
	tg->flags &= ~THROTL_TG_WAS_EMPTY;
}

static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
					   struct throtl_grp *parent_tg, bool rw)
{
	if (throtl_slice_used(parent_tg, rw)) {
		throtl_start_new_slice_with_credit(parent_tg, rw,
						   child_tg->slice_start[rw]);
	}
}

static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct throtl_service_queue *parent_sq = sq->parent_sq;
	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
	struct throtl_grp *tg_to_put = NULL;
	struct bio *bio;

	/*
	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
	 * from @tg may put its reference and @parent_sq might end up
	 * getting released prematurely.  Remember the tg to put and put it
	 * after @bio is transferred to @parent_sq.
	 */
	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
	sq->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);

	/*
	 * If our parent is another tg, we just need to transfer @bio to
	 * the parent using throtl_add_bio_tg().  If our parent is
	 * @td->service_queue, @bio is ready to be issued.  Put it on its
	 * bio_lists[] and decrease the total number queued.  The caller is
	 * responsible for issuing these bios.
	 */
	if (parent_tg) {
		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
		start_parent_slice_with_credit(tg, parent_tg, rw);
	} else {
		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
				     &parent_sq->queued[rw]);
		BUG_ON(tg->td->nr_queued[rw] <= 0);
		tg->td->nr_queued[rw]--;
	}

	throtl_trim_slice(tg, rw);

	if (tg_to_put)
		blkg_put(tg_to_blkg(tg_to_put));
}

static int throtl_dispatch_tg(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio));
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio));
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{
	unsigned int nr_disp = 0;

	while (1) {
		struct throtl_grp *tg = throtl_rb_first(parent_sq);
		struct throtl_service_queue *sq;

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(tg);

		nr_disp += throtl_dispatch_tg(tg);

		sq = &tg->service_queue;
		if (sq->nr_queued[0] || sq->nr_queued[1])
			tg_update_disptime(tg);

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}
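
/*
 * Editor's note: with the defaults above, throtl_grp_quantum == 8 splits
 * into max_nr_reads == 6 and max_nr_writes == 2 per group per round, and
 * throtl_select_dispatch() caps a whole round at throtl_quantum == 32
 * bios before breaking out.
 */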

/**
 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
 * @arg: the throtl_service_queue being serviced
 *
 * This timer is armed when a child throtl_grp with active bio's becomes
 * pending and queued on the service_queue's pending_tree and expires when
 * the first child throtl_grp should be dispatched.  This function
 * dispatches bio's from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bio's are issued.
 */
static void throtl_pending_timer_fn(unsigned long arg)
{
	struct throtl_service_queue *sq = (void *)arg;
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct request_queue *q = td->queue;
	struct throtl_service_queue *parent_sq;
	bool dispatched;
	int ret;

	spin_lock_irq(q->queue_lock);
again:
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
			   sq->nr_queued[READ], sq->nr_queued[WRITE]);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/* this dispatch window is still open, relax and repeat */
		spin_unlock_irq(q->queue_lock);
		cpu_relax();
		spin_lock_irq(q->queue_lock);
	}

	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bio's reach the bio_lists[]
 * of throtl_data->service_queue.  Those bio's are ready and issued by this
 * function.
 */
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					      dispatch_work);
	struct throtl_service_queue *td_sq = &td->service_queue;
	struct request_queue *q = td->queue;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;
	int rw;

	bio_list_init(&bio_list_on_stack);

	spin_lock_irq(q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
			bio_list_add(&bio_list_on_stack, bio);
	spin_unlock_irq(q->queue_lock);

	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}
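
/*
 * Editor's note: ->private in throtl_files[] below carries
 * offsetof(struct throtl_grp, bps[...]) or iops[...], which lets the
 * prfill helpers above and tg_set_conf() access the configured field with
 * plain pointer arithmetic:
 *
 *	*(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
 */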
1148451af504STejun Heo                            char *buf, size_t nbytes, loff_t off, bool is_u64)
114960c2bc2dSTejun Heo {
1150451af504STejun Heo         struct blkcg *blkcg = css_to_blkcg(of_css(of));
115160c2bc2dSTejun Heo         struct blkg_conf_ctx ctx;
1152af133cebSTejun Heo         struct throtl_grp *tg;
115369df0ab0STejun Heo         struct throtl_service_queue *sq;
1154693e751eSTejun Heo         struct blkcg_gq *blkg;
1155492eb21bSTejun Heo         struct cgroup_subsys_state *pos_css;
115660c2bc2dSTejun Heo         int ret;
115760c2bc2dSTejun Heo 
11583c798398STejun Heo         ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
115960c2bc2dSTejun Heo         if (ret)
116060c2bc2dSTejun Heo                 return ret;
116160c2bc2dSTejun Heo 
1162af133cebSTejun Heo         tg = blkg_to_tg(ctx.blkg);
116369df0ab0STejun Heo         sq = &tg->service_queue;
1164af133cebSTejun Heo 
1165af133cebSTejun Heo         if (!ctx.v)
1166af133cebSTejun Heo                 ctx.v = -1;
1167af133cebSTejun Heo 
1168af133cebSTejun Heo         if (is_u64)
1169451af504STejun Heo                 *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
1170af133cebSTejun Heo         else
1171451af504STejun Heo                 *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
1172af133cebSTejun Heo 
1173fda6f272STejun Heo         throtl_log(&tg->service_queue,
1174fda6f272STejun Heo                    "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
1175632b4493STejun Heo                    tg->bps[READ], tg->bps[WRITE],
1176632b4493STejun Heo                    tg->iops[READ], tg->iops[WRITE]);
1177632b4493STejun Heo 
1178632b4493STejun Heo         /*
1179693e751eSTejun Heo          * Update has_rules[] flags for the updated tg's subtree.  A tg is
1180693e751eSTejun Heo          * considered to have rules if either the tg itself or any of its
1181693e751eSTejun Heo          * ancestors has rules.  This identifies groups without any
1182693e751eSTejun Heo          * restrictions in the whole hierarchy and allows them to bypass
1183693e751eSTejun Heo          * blk-throttle.
1184693e751eSTejun Heo          */
1185492eb21bSTejun Heo         blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
1186693e751eSTejun Heo                 tg_update_has_rules(blkg_to_tg(blkg));
1187693e751eSTejun Heo 
1188693e751eSTejun Heo         /*
1189632b4493STejun Heo          * We're already holding queue_lock and know @tg is valid.  Let's
1190632b4493STejun Heo          * apply the new config directly.
1191632b4493STejun Heo          *
1192632b4493STejun Heo          * Restart the slices for both READ and WRITE.  It might happen
1193632b4493STejun Heo          * that a group's limits are dropped suddenly and we don't want to
1194632b4493STejun Heo          * account recently dispatched IO at the new low rate.
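         *
         * (Illustrative numbers: if a group's write limit drops from
         * 100MB/s to 1MB/s mid-slice, the bytes already dispatched in the
         * current slice would be charged at the new 1MB/s rate and newly
         * queued IO could see a very long dispatch time; starting fresh
         * slices avoids that.)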
1195632b4493STejun Heo          */
11960f3457f6STejun Heo         throtl_start_new_slice(tg, READ);
11970f3457f6STejun Heo         throtl_start_new_slice(tg, WRITE);
1198632b4493STejun Heo 
11995b2c16aaSTejun Heo         if (tg->flags & THROTL_TG_PENDING) {
120077216b04STejun Heo                 tg_update_disptime(tg);
12017f52f98cSTejun Heo                 throtl_schedule_next_dispatch(sq->parent_sq, true);
1202632b4493STejun Heo         }
1203af133cebSTejun Heo 
120460c2bc2dSTejun Heo         blkg_conf_finish(&ctx);
1205451af504STejun Heo         return nbytes;
120660c2bc2dSTejun Heo }
120760c2bc2dSTejun Heo 
1208451af504STejun Heo static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
1209451af504STejun Heo                                char *buf, size_t nbytes, loff_t off)
121060c2bc2dSTejun Heo {
1211451af504STejun Heo         return tg_set_conf(of, buf, nbytes, off, true);
121260c2bc2dSTejun Heo }
121360c2bc2dSTejun Heo 
1214451af504STejun Heo static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
1215451af504STejun Heo                                 char *buf, size_t nbytes, loff_t off)
121660c2bc2dSTejun Heo {
1217451af504STejun Heo         return tg_set_conf(of, buf, nbytes, off, false);
121860c2bc2dSTejun Heo }
121960c2bc2dSTejun Heo 
122060c2bc2dSTejun Heo static struct cftype throtl_files[] = {
122160c2bc2dSTejun Heo         {
122260c2bc2dSTejun Heo                 .name = "throttle.read_bps_device",
1223af133cebSTejun Heo                 .private = offsetof(struct throtl_grp, bps[READ]),
12242da8ca82STejun Heo                 .seq_show = tg_print_conf_u64,
1225451af504STejun Heo                 .write = tg_set_conf_u64,
122660c2bc2dSTejun Heo         },
122760c2bc2dSTejun Heo         {
122860c2bc2dSTejun Heo                 .name = "throttle.write_bps_device",
1229af133cebSTejun Heo                 .private = offsetof(struct throtl_grp, bps[WRITE]),
12302da8ca82STejun Heo                 .seq_show = tg_print_conf_u64,
1231451af504STejun Heo                 .write = tg_set_conf_u64,
123260c2bc2dSTejun Heo         },
123360c2bc2dSTejun Heo         {
123460c2bc2dSTejun Heo                 .name = "throttle.read_iops_device",
1235af133cebSTejun Heo                 .private = offsetof(struct throtl_grp, iops[READ]),
12362da8ca82STejun Heo                 .seq_show = tg_print_conf_uint,
1237451af504STejun Heo                 .write = tg_set_conf_uint,
123860c2bc2dSTejun Heo         },
123960c2bc2dSTejun Heo         {
124060c2bc2dSTejun Heo                 .name = "throttle.write_iops_device",
1241af133cebSTejun Heo                 .private = offsetof(struct throtl_grp, iops[WRITE]),
12422da8ca82STejun Heo                 .seq_show = tg_print_conf_uint,
1243451af504STejun Heo                 .write = tg_set_conf_uint,
124460c2bc2dSTejun Heo         },
124560c2bc2dSTejun Heo         {
124660c2bc2dSTejun Heo                 .name = "throttle.io_service_bytes",
124777ea7338STejun Heo                 .private = (unsigned long)&blkcg_policy_throtl,
124877ea7338STejun Heo                 .seq_show = blkg_print_stat_bytes,
124960c2bc2dSTejun Heo         },
125060c2bc2dSTejun Heo         {
125160c2bc2dSTejun Heo                 .name = "throttle.io_serviced",
125277ea7338STejun Heo                 .private = (unsigned long)&blkcg_policy_throtl,
125377ea7338STejun Heo                 .seq_show = blkg_print_stat_ios,
125460c2bc2dSTejun Heo         },
125560c2bc2dSTejun Heo         { }     /* terminate */
125660c2bc2dSTejun Heo };
125760c2bc2dSTejun Heo 
1258da527770SVivek Goyal static void throtl_shutdown_wq(struct request_queue *q)
1259e43473b7SVivek Goyal {
1260e43473b7SVivek Goyal         struct throtl_data *td = q->td;
1261e43473b7SVivek Goyal 
126269df0ab0STejun Heo         cancel_work_sync(&td->dispatch_work);
1263e43473b7SVivek Goyal }
1264e43473b7SVivek Goyal 
12653c798398STejun Heo static struct blkcg_policy blkcg_policy_throtl = {
1266f9fcc2d3STejun Heo         .cftypes                = throtl_files,
1267f9fcc2d3STejun Heo 
1268001bea73STejun Heo         .pd_alloc_fn            = throtl_pd_alloc,
12693c798398STejun Heo         .pd_init_fn             = throtl_pd_init,
1270693e751eSTejun Heo         .pd_online_fn           = throtl_pd_online,
1271001bea73STejun Heo         .pd_free_fn             = throtl_pd_free,
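        /*
         * Lifecycle note: the blkcg core allocates and initializes one
         * throtl_grp per (blkcg, request_queue) pair through
         * pd_alloc/pd_init, calls pd_online once the group is linked into
         * the hierarchy, and releases it through pd_free.
         */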
1272e43473b7SVivek Goyal };
1273e43473b7SVivek Goyal 
1274ae118896STejun Heo bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
1275ae118896STejun Heo                     struct bio *bio)
1276e43473b7SVivek Goyal {
1277c5cc2070STejun Heo         struct throtl_qnode *qn = NULL;
1278ae118896STejun Heo         struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
127973f0d49aSTejun Heo         struct throtl_service_queue *sq;
12800e9f4164STejun Heo         bool rw = bio_data_dir(bio);
1281bc16a4f9STejun Heo         bool throttled = false;
1282e43473b7SVivek Goyal 
1283ae118896STejun Heo         WARN_ON_ONCE(!rcu_read_lock_held());
1284ae118896STejun Heo 
12852a0f61e6STejun Heo         /* see throtl_charge_bio() */
1286ae118896STejun Heo         if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
1287bc16a4f9STejun Heo                 goto out;
1288e43473b7SVivek Goyal 
1289e43473b7SVivek Goyal         spin_lock_irq(q->queue_lock);
1290c9589f03STejun Heo 
1291c9589f03STejun Heo         if (unlikely(blk_queue_bypass(q)))
1292bc16a4f9STejun Heo                 goto out_unlock;
1293f469a7b4SVivek Goyal 
129473f0d49aSTejun Heo         sq = &tg->service_queue;
129573f0d49aSTejun Heo 
12969e660acfSTejun Heo         while (true) {
12979e660acfSTejun Heo                 /* throtl is FIFO - if bios are already queued, should queue */
12980e9f4164STejun Heo                 if (sq->nr_queued[rw])
12999e660acfSTejun Heo                         break;
1300de701c74SVivek Goyal 
13019e660acfSTejun Heo                 /* if above limits, break to queue */
13029e660acfSTejun Heo                 if (!tg_may_dispatch(tg, bio, NULL))
13039e660acfSTejun Heo                         break;
13049e660acfSTejun Heo 
13059e660acfSTejun Heo                 /* within limits, let's charge and dispatch directly */
1306e43473b7SVivek Goyal                 throtl_charge_bio(tg, bio);
130704521db0SVivek Goyal 
130804521db0SVivek Goyal                 /*
130904521db0SVivek Goyal                  * We need to trim slice even when bios are not being queued
131004521db0SVivek Goyal                  * otherwise it might happen that a bio is not queued for
131104521db0SVivek Goyal                  * a long time and slice keeps on extending and trim is not
131204521db0SVivek Goyal                  * called for a long time. Now if limits are reduced suddenly
131304521db0SVivek Goyal                  * we take into account all the IO dispatched so far at the new
131404521db0SVivek Goyal                  * low rate and newly queued IO gets a really long dispatch
131504521db0SVivek Goyal                  * time.
131604521db0SVivek Goyal                  *
131704521db0SVivek Goyal                  * So keep on trimming slice even if bio is not queued.
131804521db0SVivek Goyal                  */
13190f3457f6STejun Heo                 throtl_trim_slice(tg, rw);
13209e660acfSTejun Heo 
13219e660acfSTejun Heo                 /*
13229e660acfSTejun Heo                  * @bio passed through this layer without being throttled.
13239e660acfSTejun Heo                  * Climb up the ladder.  If we're already at the top, it
13249e660acfSTejun Heo                  * can be executed directly.
13259e660acfSTejun Heo                  */
1326c5cc2070STejun Heo                 qn = &tg->qnode_on_parent[rw];
13279e660acfSTejun Heo                 sq = sq->parent_sq;
13289e660acfSTejun Heo                 tg = sq_to_tg(sq);
13299e660acfSTejun Heo                 if (!tg)
1330bc16a4f9STejun Heo                         goto out_unlock;
1331e43473b7SVivek Goyal         }
1332e43473b7SVivek Goyal 
13339e660acfSTejun Heo         /* out-of-limit, queue to @tg */
1334fda6f272STejun Heo         throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
13358e89d13fSVivek Goyal                    rw == READ ? 'R' : 'W',
13364f024f37SKent Overstreet                    tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
13378e89d13fSVivek Goyal                    tg->io_disp[rw], tg->iops[rw],
133873f0d49aSTejun Heo                    sq->nr_queued[READ], sq->nr_queued[WRITE]);
1339e43473b7SVivek Goyal 
1340671058fbSTejun Heo         bio_associate_current(bio);
13416bc9c2b4STejun Heo         tg->td->nr_queued[rw]++;
1342c5cc2070STejun Heo         throtl_add_bio_tg(bio, qn, tg);
1343bc16a4f9STejun Heo         throttled = true;
1344e43473b7SVivek Goyal 
13457f52f98cSTejun Heo         /*
13467f52f98cSTejun Heo          * Update @tg's dispatch time and force schedule dispatch if @tg
13477f52f98cSTejun Heo          * was empty before @bio.  The forced scheduling isn't likely to
13487f52f98cSTejun Heo          * cause undue delay as @bio is likely to be dispatched directly if
13497f52f98cSTejun Heo          * its @tg's disptime is not in the future.
13507f52f98cSTejun Heo          */
13510e9f4164STejun Heo         if (tg->flags & THROTL_TG_WAS_EMPTY) {
135277216b04STejun Heo                 tg_update_disptime(tg);
13537f52f98cSTejun Heo                 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
1354e43473b7SVivek Goyal         }
1355e43473b7SVivek Goyal 
1356bc16a4f9STejun Heo out_unlock:
1357e43473b7SVivek Goyal         spin_unlock_irq(q->queue_lock);
1358bc16a4f9STejun Heo out:
13592a0f61e6STejun Heo         /*
13602a0f61e6STejun Heo          * As multiple blk-throttle instances may stack in the same issue
13612a0f61e6STejun Heo          * path, we don't want bios to leave with the flag set.  Clear the
13622a0f61e6STejun Heo          * flag if being issued.
13632a0f61e6STejun Heo          */
13642a0f61e6STejun Heo         if (!throttled)
13652a0f61e6STejun Heo                 bio->bi_rw &= ~REQ_THROTTLED;
1366bc16a4f9STejun Heo         return throttled;
1367e43473b7SVivek Goyal }
1368e43473b7SVivek Goyal 
13692a12f0dcSTejun Heo /*
13702a12f0dcSTejun Heo  * Dispatch all bios from all children tg's queued on @parent_sq.  On
13712a12f0dcSTejun Heo  * return, @parent_sq is guaranteed to not have any active children tg's
13722a12f0dcSTejun Heo  * and all bios from previously active tg's are on @parent_sq->queued[].
13732a12f0dcSTejun Heo  */
13742a12f0dcSTejun Heo static void tg_drain_bios(struct throtl_service_queue *parent_sq)
13752a12f0dcSTejun Heo {
13762a12f0dcSTejun Heo         struct throtl_grp *tg;
13772a12f0dcSTejun Heo 
13782a12f0dcSTejun Heo         while ((tg = throtl_rb_first(parent_sq))) {
13792a12f0dcSTejun Heo                 struct throtl_service_queue *sq = &tg->service_queue;
13802a12f0dcSTejun Heo                 struct bio *bio;
13812a12f0dcSTejun Heo 
13822a12f0dcSTejun Heo                 throtl_dequeue_tg(tg);
13832a12f0dcSTejun Heo 
1384c5cc2070STejun Heo                 while ((bio = throtl_peek_queued(&sq->queued[READ])))
13852a12f0dcSTejun Heo                         tg_dispatch_one_bio(tg, bio_data_dir(bio));
1386c5cc2070STejun Heo                 while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
13872a12f0dcSTejun Heo                         tg_dispatch_one_bio(tg, bio_data_dir(bio));
13882a12f0dcSTejun Heo         }
13892a12f0dcSTejun Heo }
13902a12f0dcSTejun Heo 
1391c9a929ddSTejun Heo /**
1392c9a929ddSTejun Heo  * blk_throtl_drain - drain throttled bios
1393c9a929ddSTejun Heo  * @q: request_queue to drain throttled bios for
1394c9a929ddSTejun Heo  *
1395c9a929ddSTejun Heo  * Dispatch all currently throttled bios on @q through ->make_request_fn().
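 *
 * Must be called with q->queue_lock held; the lock is released and
 * re-acquired around the final issuing loop, as the __releases/__acquires
 * annotations below indicate.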
1396c9a929ddSTejun Heo  */
1397c9a929ddSTejun Heo void blk_throtl_drain(struct request_queue *q)
1398c9a929ddSTejun Heo         __releases(q->queue_lock) __acquires(q->queue_lock)
1399c9a929ddSTejun Heo {
1400c9a929ddSTejun Heo         struct throtl_data *td = q->td;
14012a12f0dcSTejun Heo         struct blkcg_gq *blkg;
1402492eb21bSTejun Heo         struct cgroup_subsys_state *pos_css;
1403c9a929ddSTejun Heo         struct bio *bio;
1404651930bcSTejun Heo         int rw;
1405c9a929ddSTejun Heo 
14068bcb6c7dSAndi Kleen         queue_lockdep_assert_held(q);
14072a12f0dcSTejun Heo         rcu_read_lock();
1408c9a929ddSTejun Heo 
14092a12f0dcSTejun Heo         /*
14102a12f0dcSTejun Heo          * Drain each tg while doing post-order walk on the blkg tree, so
14112a12f0dcSTejun Heo          * that all bios are propagated to td->service_queue.  It'd be
14122a12f0dcSTejun Heo          * better to walk the service_queue tree directly but blkg walk is
14132a12f0dcSTejun Heo          * easier.
14142a12f0dcSTejun Heo          */
1415492eb21bSTejun Heo         blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
14162a12f0dcSTejun Heo                 tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
141773f0d49aSTejun Heo 
14182a12f0dcSTejun Heo         /* finally, transfer bios from top-level tg's into the td */
14192a12f0dcSTejun Heo         tg_drain_bios(&td->service_queue);
14202a12f0dcSTejun Heo 
14212a12f0dcSTejun Heo         rcu_read_unlock();
1422c9a929ddSTejun Heo         spin_unlock_irq(q->queue_lock);
1423c9a929ddSTejun Heo 
14242a12f0dcSTejun Heo         /* all bios now should be in td->service_queue, issue them */
1425651930bcSTejun Heo         for (rw = READ; rw <= WRITE; rw++)
1426c5cc2070STejun Heo                 while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
1427c5cc2070STejun Heo                                                 NULL)))
1428c9a929ddSTejun Heo                         generic_make_request(bio);
1429c9a929ddSTejun Heo 
1430c9a929ddSTejun Heo         spin_lock_irq(q->queue_lock);
1431c9a929ddSTejun Heo }
1432c9a929ddSTejun Heo 
1433e43473b7SVivek Goyal int blk_throtl_init(struct request_queue *q)
1434e43473b7SVivek Goyal {
1435e43473b7SVivek Goyal         struct throtl_data *td;
1436a2b1693bSTejun Heo         int ret;
1437e43473b7SVivek Goyal 
1438e43473b7SVivek Goyal         td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1439e43473b7SVivek Goyal         if (!td)
1440e43473b7SVivek Goyal                 return -ENOMEM;
1441e43473b7SVivek Goyal 
144269df0ab0STejun Heo         INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
1443b2ce2643STejun Heo         throtl_service_queue_init(&td->service_queue);
1444e43473b7SVivek Goyal 
1445cd1604faSTejun Heo         q->td = td;
144629b12589SVivek Goyal         td->queue = q;
144702977e4aSVivek Goyal 
1448a2b1693bSTejun Heo         /* activate policy */
14493c798398STejun Heo         ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
1450a2b1693bSTejun Heo         if (ret)
145129b12589SVivek Goyal                 kfree(td);
1452a2b1693bSTejun Heo         return ret;
1453e43473b7SVivek Goyal }
1454e43473b7SVivek Goyal 
1455e43473b7SVivek Goyal void blk_throtl_exit(struct request_queue *q)
1456e43473b7SVivek Goyal {
1457c875f4d0STejun Heo         BUG_ON(!q->td);
1458da527770SVivek Goyal         throtl_shutdown_wq(q);
14593c798398STejun Heo         blkcg_deactivate_policy(q, &blkcg_policy_throtl);
1460c9a929ddSTejun Heo         kfree(q->td);
1461e43473b7SVivek Goyal }
1462e43473b7SVivek Goyal 
1463e43473b7SVivek Goyal static int __init throtl_init(void)
1464e43473b7SVivek Goyal {
1465450adcbeSVivek Goyal         kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
1466450adcbeSVivek Goyal         if (!kthrotld_workqueue)
1467450adcbeSVivek Goyal                 panic("Failed to create kthrotld\n");
1468450adcbeSVivek Goyal 
14693c798398STejun Heo         return blkcg_policy_register(&blkcg_policy_throtl);
1470e43473b7SVivek Goyal }
1471e43473b7SVivek Goyal 
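/*
 * Usage sketch (illustrative, assuming the legacy blkio controller is
 * mounted at /sys/fs/cgroup/blkio): the throtl_files cftypes above show
 * up as per-cgroup files, so e.g.
 *
 *   # echo "8:16 1048576" > /sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device
 *
 * is parsed by blkg_conf_prep() ("major:minor value") and lands in
 * tg_set_conf_u64(), storing a 1MB/s read limit in bps[READ] of the
 * throtl_grp for device 8:16.
 */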
1472e43473b7SVivek Goyal module_init(throtl_init);
1473
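/*
 * Overview of the dispatch pipeline implemented above (informational):
 *
 *   blk_throtl_bio()              - over-limit bios are queued on their tg
 *   throtl_pending_timer_fn()     - propagates queued bios level by level
 *                                   toward td->service_queue
 *   blk_throtl_dispatch_work_fn() - issues bios that reached the top via
 *                                   generic_make_request()
 */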