/* xref: /linux/block/blk-throttle.c (revision a2b1693bac45ea3fe3ba612fd22c45f17449f610) */
/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a 100ms slice, and the slice is renewed after it expires */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

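/*
 * Illustrative note (added commentary, not from the original source):
 * with HZ=1000 the slice is 100 jiffies.  Per tg_with_in_bps_limit()
 * below, a group's byte budget over one rounded slice works out to
 * bps * throtl_slice / HZ, e.g. 104857 bytes per 100ms slice at
 * bps = 1048576 (1MB/s).
 */
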
static struct blkio_policy_type blkio_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
				unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;		/* cached leftmost (smallest disptime) node */
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
};

struct throtl_grp {
	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the
	 * group will be unthrottled and ready to dispatch more bios. It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * Total number of undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
{
	return blkg_to_pdata(blkg, &blkio_policy_throtl);
}

static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
{
	return pdata_to_blkg(tg);
}

static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
{
	return blkg_to_tg(td->queue->root_blkg);
}

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);

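/*
 * Illustrative expansion (added commentary): the invocation above
 * generates three helpers operating on THROTL_TG_FLAG_on_rr:
 *
 *	throtl_mark_tg_on_rr(tg)	- set the on_rr flag
 *	throtl_clear_tg_on_rr(tg)	- clear the on_rr flag
 *	throtl_tg_on_rr(tg)		- test the flag, returns 0 or 1
 */
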
#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
			  blkg_path(tg_to_blkg(tg)), ##args);

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}

/*
 * Worker for allocating per-cpu stats for tgs. This is scheduled on the
 * system_nrt_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			queue_delayed_work(system_nrt_wq, dwork,
					   msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}

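/*
 * Added note: the function-static stats_cpu pointer above deliberately
 * survives across invocations.  swap() hands the fresh allocation to the
 * first waiting tg and leaves the tg's old value (typically NULL) in
 * stats_cpu; if the list went empty before we got the lock, the spare
 * allocation is simply kept for the next round instead of being freed.
 */
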
static void throtl_init_blkio_group(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but percpu allocator can't be called from IO path.  Queue tg on
	 * tg_stats_alloc_list and allocate from work item.
	 */
	spin_lock(&tg_stats_alloc_lock);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
	spin_unlock(&tg_stats_alloc_lock);
}

static void throtl_exit_blkio_group(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	spin_lock(&tg_stats_alloc_lock);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock(&tg_stats_alloc_lock);

	free_percpu(tg->stats_cpu);
}

static void throtl_reset_group_stats(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct
throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup)
		return td_root_tg(td);

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}

static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkio_cgroup *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup) {
		tg = td_root_tg(td);
	} else {
		struct blkio_group *blkg;

		blkg = blkg_lookup_create(blkcg, q, false);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dead(q))
			tg = td_root_tg(td);
	}

	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}

static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

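/*
 * Added note: st->left caches the leftmost node (smallest disptime) so
 * throtl_rb_first() is O(1) in the common case.  The insert above only
 * updates the cache when the new node was linked on the leftmost path,
 * i.e. we never took a right turn while descending.
 */
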
static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}

static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}

/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if the slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially the cgroup limit was very low resulting in a high
	 * slice_end, but later the limit was bumped up and the bio was
	 * dispatched sooner; then we need to reduce slice_end. A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */

	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

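/*
 * Worked example with assumed values (added commentary): HZ=1000,
 * throtl_slice=100 jiffies, tg->bps[rw]=1048576 (1MB/s).  If 3 full
 * slices have elapsed since slice_start, then
 *
 *	bytes_trim = 1048576 * 100 * 3 / 1000 = 314572
 *
 * is subtracted from bytes_disp[rw] and slice_start[rw] advances by
 * 300 jiffies, forgiving exactly the budget of the elapsed slices.
 */
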
static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value: the minimum iops can
	 * be 1, so at most the elapsed jiffies should be equivalent to 1
	 * second, as we will allow dispatch after 1 second and by then the
	 * slice should have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}

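/*
 * Worked example with assumed values (added commentary): tg->iops[rw]=100,
 * HZ=1000 and a slice that just started (jiffy_elapsed=0, so
 * jiffy_elapsed_rnd is rounded up to one throtl_slice of 100 jiffies).
 * Then io_allowed = 100 * 100 / 1000 = 10; with io_disp[rw]=10 the bio
 * must wait (10 + 1) * 1000 / 100 + 1 - 0 = 111 jiffies.
 */
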
static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

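/*
 * Worked example with assumed values (added commentary): tg->bps[rw]=1048576
 * (1MB/s), HZ=1000, fresh slice (jiffy_elapsed=0, jiffy_elapsed_rnd=100),
 * so bytes_allowed = 1048576 * 100 / 1000 = 104857.  A 256KB bio with
 * bytes_disp[rw]=0 exceeds that: extra_bytes = 262144 - 104857 = 157287,
 * jiffy_wait = 157287 * 1000 / 1048576 = 150, plus the rounding slack
 * (100 - 0) for a total wait of 250 jiffies.
 */
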
static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
{
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns the approx
 * number of jiffies to wait before this bio is within the IO rate and can
 * be dispatched.
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the
	 * first bio queued in the group bio list. So one should not be
	 * calling this function with a different bio if there are other
	 * bios queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If the previous slice expired, start a new one, otherwise
	 * renew/extend the existing slice to make sure it is at least
	 * throtl_slice interval long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}

static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
					 int rw)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct tg_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (tg->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(tg->stats_cpu);

	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	blkg_get(tg_to_blkg(tg));
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on blkg */
	blkg_put(tg_to_blkg(tg));

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}

static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

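/*
 * Added note: with the default throtl_grp_quantum of 8, the 75/25 split
 * above works out to max_nr_reads = 8 * 3 / 4 = 6 and max_nr_writes =
 * 8 - 6 = 2 bios per group per dispatch round.
 */
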
static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

static void throtl_process_limit_change(struct throtl_data *td)
{
	struct request_queue *q = td->queue;
	struct blkio_group *blkg, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITES. It might
		 * happen that a group's limits are dropped suddenly and we
		 * don't want to account recently dispatched IO at the new
		 * low rate.
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to ensure
	 * immediate dispatch.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}

/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) || td->limits_changed) {
		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
{
	struct throtl_grp *tg = pdata;
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}

static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkio_policy_throtl,
			  cft->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, void *pdata, int off)
{
	u64 v = *(u64 *)(pdata + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pdata, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
{
	unsigned int v = *(unsigned int *)(pdata + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pdata, v);
}

static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_u64,
			  &blkio_policy_throtl, cft->private, false);
	return 0;
}

static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_uint,
			  &blkio_policy_throtl, cft->private, false);
	return 0;
}

static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
		       bool is_u64)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	struct throtl_data *td;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkio_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	td = ctx.blkg->q->td;

	if (!ctx.v)
		ctx.v = -1;

	if (is_u64)
		*(u64 *)((void *)tg + cft->private) = ctx.v;
	else
		*(unsigned int *)((void *)tg + cft->private) = ctx.v;

	/* XXX: we don't need the following deferred processing */
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	throtl_schedule_delayed_work(td, 0);

	blkg_conf_finish(&ctx);
	return 0;
}

static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			   const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, false);
}

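/*
 * Usage sketch (added commentary; device numbers and mount path are
 * assumptions for illustration): the cftype entries below back the
 * blkio cgroup files, e.g. limiting reads on a device with major:minor
 * 8:16 to 1MB/s from userspace:
 *
 *	echo "8:16 1048576" > /cgroup/blkio/grp/blkio.throttle.read_bps_device
 *
 * blkg_conf_prep() parses the "major:minor value" string and resolves
 * the blkg; tg_set_conf() then stores the value at cft->private's
 * offset within struct throtl_grp.
 */
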
static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_init_group_fn = throtl_init_blkio_group,
		.blkio_exit_group_fn = throtl_exit_blkio_group,
		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
	},
	.pdata_size = sizeof(struct throtl_grp),
	.cftypes = throtl_files,
};

bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/* bio_associate_current() needs ioc, try creating */
	create_io_context(GFP_ATOMIC, q->node);

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in a lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkio_cgroup(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
						     bio->bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either the group has not been allocated yet or it is not an
	 * unlimited IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in the same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not queued
		 * for a long time and the slice keeps on extending while
		 * trim is not called for a long time. Then, if limits are
		 * reduced suddenly, we would account all the IO dispatched
		 * so far at the new low rate and newly queued IO would get
		 * a really long dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not
		 * queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out_unlock;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	bio_associate_current(bio);
	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_rb_root *st = &td->tg_service_tree;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	WARN_ON_ONCE(!queue_is_locked(q));

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(st))) {
		throtl_dequeue_tg(td, tg);

		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	q->td = td;
	td->queue = q;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkio_policy_throtl);
	if (ret)
		kfree(td);
	return ret;
}

void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkio_policy_throtl);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkio_policy_register(&blkio_policy_throtl);
}

module_init(throtl_init);