xref: /linux/block/blk-throttle.c (revision 5e901a2b95db709c5e40599ff4df6029be1e2a12)
1e43473b7SVivek Goyal /*
2e43473b7SVivek Goyal  * Interface for controlling IO bandwidth on a request queue
3e43473b7SVivek Goyal  *
4e43473b7SVivek Goyal  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5e43473b7SVivek Goyal  */
6e43473b7SVivek Goyal 
7e43473b7SVivek Goyal #include <linux/module.h>
8e43473b7SVivek Goyal #include <linux/slab.h>
9e43473b7SVivek Goyal #include <linux/blkdev.h>
10e43473b7SVivek Goyal #include <linux/bio.h>
11e43473b7SVivek Goyal #include <linux/blktrace_api.h>
12e43473b7SVivek Goyal #include "blk-cgroup.h"
13e43473b7SVivek Goyal 
14e43473b7SVivek Goyal /* Max dispatch from a group in 1 round */
15e43473b7SVivek Goyal static int throtl_grp_quantum = 8;
16e43473b7SVivek Goyal 
17e43473b7SVivek Goyal /* Total max dispatch from all groups in one round */
18e43473b7SVivek Goyal static int throtl_quantum = 32;
19e43473b7SVivek Goyal 
20e43473b7SVivek Goyal /* Throttling is performed over a 100ms slice and after that the slice is renewed */
21e43473b7SVivek Goyal static unsigned long throtl_slice = HZ/10;	/* 100 ms */
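/*
 * Example (illustrative numbers, with throtl_slice = HZ/10 as above): a group
 * limited to 10 MB/s (bps = 10485760) can dispatch roughly 1 MB per slice, and
 * a group limited to 100 iops roughly 10 bios per slice, before further bios
 * are queued and their dispatch is deferred to a later slice.
 */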
22e43473b7SVivek Goyal 
23e43473b7SVivek Goyal struct throtl_rb_root {
24e43473b7SVivek Goyal 	struct rb_root rb;
25e43473b7SVivek Goyal 	struct rb_node *left;
26e43473b7SVivek Goyal 	unsigned int count;
27e43473b7SVivek Goyal 	unsigned long min_disptime;
28e43473b7SVivek Goyal };
29e43473b7SVivek Goyal 
30e43473b7SVivek Goyal #define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
31e43473b7SVivek Goyal 			.count = 0, .min_disptime = 0}
32e43473b7SVivek Goyal 
33e43473b7SVivek Goyal #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
34e43473b7SVivek Goyal 
35e43473b7SVivek Goyal struct throtl_grp {
36e43473b7SVivek Goyal 	/* List of throtl groups on the request queue */
37e43473b7SVivek Goyal 	struct hlist_node tg_node;
38e43473b7SVivek Goyal 
39e43473b7SVivek Goyal 	/* active throtl group service_tree member */
40e43473b7SVivek Goyal 	struct rb_node rb_node;
41e43473b7SVivek Goyal 
42e43473b7SVivek Goyal 	/*
43e43473b7SVivek Goyal 	 * Dispatch time in jiffies. This is the estimated time when the group
44e43473b7SVivek Goyal 	 * will be unthrottled and ready to dispatch more bios. It is used as
45e43473b7SVivek Goyal 	 * the key to sort active groups in the service tree.
46e43473b7SVivek Goyal 	 */
47e43473b7SVivek Goyal 	unsigned long disptime;
48e43473b7SVivek Goyal 
49e43473b7SVivek Goyal 	struct blkio_group blkg;
50e43473b7SVivek Goyal 	atomic_t ref;
51e43473b7SVivek Goyal 	unsigned int flags;
52e43473b7SVivek Goyal 
53e43473b7SVivek Goyal 	/* Two lists for READ and WRITE */
54e43473b7SVivek Goyal 	struct bio_list bio_lists[2];
55e43473b7SVivek Goyal 
56e43473b7SVivek Goyal 	/* Number of queued bios on READ and WRITE lists */
57e43473b7SVivek Goyal 	unsigned int nr_queued[2];
58e43473b7SVivek Goyal 
59e43473b7SVivek Goyal 	/* bytes per second rate limits */
60e43473b7SVivek Goyal 	uint64_t bps[2];
61e43473b7SVivek Goyal 
628e89d13fSVivek Goyal 	/* IOPS limits */
638e89d13fSVivek Goyal 	unsigned int iops[2];
648e89d13fSVivek Goyal 
65e43473b7SVivek Goyal 	/* Number of bytes dispatched in the current slice */
66e43473b7SVivek Goyal 	uint64_t bytes_disp[2];
678e89d13fSVivek Goyal 	/* Number of bios dispatched in the current slice */
688e89d13fSVivek Goyal 	unsigned int io_disp[2];
69e43473b7SVivek Goyal 
70e43473b7SVivek Goyal 	/* When did we start a new slice */
71e43473b7SVivek Goyal 	unsigned long slice_start[2];
72e43473b7SVivek Goyal 	unsigned long slice_end[2];
73fe071437SVivek Goyal 
74fe071437SVivek Goyal 	/* Some throttle limits got updated for the group */
75fe071437SVivek Goyal 	bool limits_changed;
76e43473b7SVivek Goyal };
77e43473b7SVivek Goyal 
78e43473b7SVivek Goyal struct throtl_data
79e43473b7SVivek Goyal {
80e43473b7SVivek Goyal 	/* List of throtl groups */
81e43473b7SVivek Goyal 	struct hlist_head tg_list;
82e43473b7SVivek Goyal 
83e43473b7SVivek Goyal 	/* service tree for active throtl groups */
84e43473b7SVivek Goyal 	struct throtl_rb_root tg_service_tree;
85e43473b7SVivek Goyal 
86e43473b7SVivek Goyal 	struct throtl_grp root_tg;
87e43473b7SVivek Goyal 	struct request_queue *queue;
88e43473b7SVivek Goyal 
89e43473b7SVivek Goyal 	/* Total number of queued bios on READ and WRITE lists */
90e43473b7SVivek Goyal 	unsigned int nr_queued[2];
91e43473b7SVivek Goyal 
92e43473b7SVivek Goyal 	/*
9302977e4aSVivek Goyal 	 * Total number of undestroyed groups
94e43473b7SVivek Goyal 	 */
95e43473b7SVivek Goyal 	unsigned int nr_undestroyed_grps;
96e43473b7SVivek Goyal 
97e43473b7SVivek Goyal 	/* Work for dispatching throttled bios */
98e43473b7SVivek Goyal 	struct delayed_work throtl_work;
99fe071437SVivek Goyal 
100fe071437SVivek Goyal 	atomic_t limits_changed;
101e43473b7SVivek Goyal };
102e43473b7SVivek Goyal 
103e43473b7SVivek Goyal enum tg_state_flags {
104e43473b7SVivek Goyal 	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
105e43473b7SVivek Goyal };
106e43473b7SVivek Goyal 
107e43473b7SVivek Goyal #define THROTL_TG_FNS(name)						\
108e43473b7SVivek Goyal static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
109e43473b7SVivek Goyal {									\
110e43473b7SVivek Goyal 	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
111e43473b7SVivek Goyal }									\
112e43473b7SVivek Goyal static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
113e43473b7SVivek Goyal {									\
114e43473b7SVivek Goyal 	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
115e43473b7SVivek Goyal }									\
116e43473b7SVivek Goyal static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
117e43473b7SVivek Goyal {									\
118e43473b7SVivek Goyal 	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
119e43473b7SVivek Goyal }
120e43473b7SVivek Goyal 
121e43473b7SVivek Goyal THROTL_TG_FNS(on_rr);
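/*
 * The THROTL_TG_FNS(on_rr) expansion above generates the helpers
 * throtl_mark_tg_on_rr(), throtl_clear_tg_on_rr() and throtl_tg_on_rr(),
 * which set, clear and test THROTL_TG_FLAG_on_rr in tg->flags.
 */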
122e43473b7SVivek Goyal 
123e43473b7SVivek Goyal #define throtl_log_tg(td, tg, fmt, args...)				\
124e43473b7SVivek Goyal 	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
125e43473b7SVivek Goyal 				blkg_path(&(tg)->blkg), ##args);
126e43473b7SVivek Goyal 
127e43473b7SVivek Goyal #define throtl_log(td, fmt, args...)	\
128e43473b7SVivek Goyal 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
129e43473b7SVivek Goyal 
130e43473b7SVivek Goyal static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
131e43473b7SVivek Goyal {
132e43473b7SVivek Goyal 	if (blkg)
133e43473b7SVivek Goyal 		return container_of(blkg, struct throtl_grp, blkg);
134e43473b7SVivek Goyal 
135e43473b7SVivek Goyal 	return NULL;
136e43473b7SVivek Goyal }
137e43473b7SVivek Goyal 
138e43473b7SVivek Goyal static inline int total_nr_queued(struct throtl_data *td)
139e43473b7SVivek Goyal {
140e43473b7SVivek Goyal 	return (td->nr_queued[0] + td->nr_queued[1]);
141e43473b7SVivek Goyal }
142e43473b7SVivek Goyal 
143e43473b7SVivek Goyal static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
144e43473b7SVivek Goyal {
145e43473b7SVivek Goyal 	atomic_inc(&tg->ref);
146e43473b7SVivek Goyal 	return tg;
147e43473b7SVivek Goyal }
148e43473b7SVivek Goyal 
149e43473b7SVivek Goyal static void throtl_put_tg(struct throtl_grp *tg)
150e43473b7SVivek Goyal {
151e43473b7SVivek Goyal 	BUG_ON(atomic_read(&tg->ref) <= 0);
152e43473b7SVivek Goyal 	if (!atomic_dec_and_test(&tg->ref))
153e43473b7SVivek Goyal 		return;
154e43473b7SVivek Goyal 	kfree(tg);
155e43473b7SVivek Goyal }
156e43473b7SVivek Goyal 
157e43473b7SVivek Goyal static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
158e43473b7SVivek Goyal 			struct cgroup *cgroup)
159e43473b7SVivek Goyal {
160e43473b7SVivek Goyal 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
161e43473b7SVivek Goyal 	struct throtl_grp *tg = NULL;
162e43473b7SVivek Goyal 	void *key = td;
163e43473b7SVivek Goyal 	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
164e43473b7SVivek Goyal 	unsigned int major, minor;
165e43473b7SVivek Goyal 
166e43473b7SVivek Goyal 	/*
167e43473b7SVivek Goyal 	 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
168e43473b7SVivek Goyal 	 * tree of blkg (instead of traversing through the hash list all
169e43473b7SVivek Goyal 	 * the time).
170e43473b7SVivek Goyal 	 */
171e43473b7SVivek Goyal 	tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
172e43473b7SVivek Goyal 
173e43473b7SVivek Goyal 	/* Fill in device details for root group */
174e43473b7SVivek Goyal 	if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
175e43473b7SVivek Goyal 		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
176e43473b7SVivek Goyal 		tg->blkg.dev = MKDEV(major, minor);
177e43473b7SVivek Goyal 		goto done;
178e43473b7SVivek Goyal 	}
179e43473b7SVivek Goyal 
180e43473b7SVivek Goyal 	if (tg)
181e43473b7SVivek Goyal 		goto done;
182e43473b7SVivek Goyal 
183e43473b7SVivek Goyal 	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
184e43473b7SVivek Goyal 	if (!tg)
185e43473b7SVivek Goyal 		goto done;
186e43473b7SVivek Goyal 
187e43473b7SVivek Goyal 	INIT_HLIST_NODE(&tg->tg_node);
188e43473b7SVivek Goyal 	RB_CLEAR_NODE(&tg->rb_node);
189e43473b7SVivek Goyal 	bio_list_init(&tg->bio_lists[0]);
190e43473b7SVivek Goyal 	bio_list_init(&tg->bio_lists[1]);
191e43473b7SVivek Goyal 
192e43473b7SVivek Goyal 	/*
193e43473b7SVivek Goyal 	 * Take the initial reference that will be released on destroy.
194e43473b7SVivek Goyal 	 * This can be thought of as a joint reference by cgroup and
195e43473b7SVivek Goyal 	 * request queue which will be dropped by either request queue
196e43473b7SVivek Goyal 	 * exit or cgroup deletion path depending on who is exiting first.
197e43473b7SVivek Goyal 	 */
198e43473b7SVivek Goyal 	atomic_set(&tg->ref, 1);
199e43473b7SVivek Goyal 
200e43473b7SVivek Goyal 	/* Add group onto cgroup list */
201e43473b7SVivek Goyal 	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
202e43473b7SVivek Goyal 	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
203e43473b7SVivek Goyal 				MKDEV(major, minor), BLKIO_POLICY_THROTL);
204e43473b7SVivek Goyal 
205e43473b7SVivek Goyal 	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
206e43473b7SVivek Goyal 	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
2078e89d13fSVivek Goyal 	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
2088e89d13fSVivek Goyal 	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
209e43473b7SVivek Goyal 
210e43473b7SVivek Goyal 	hlist_add_head(&tg->tg_node, &td->tg_list);
211e43473b7SVivek Goyal 	td->nr_undestroyed_grps++;
212e43473b7SVivek Goyal done:
213e43473b7SVivek Goyal 	return tg;
214e43473b7SVivek Goyal }
215e43473b7SVivek Goyal 
216e43473b7SVivek Goyal static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
217e43473b7SVivek Goyal {
218e43473b7SVivek Goyal 	struct cgroup *cgroup;
219e43473b7SVivek Goyal 	struct throtl_grp *tg = NULL;
220e43473b7SVivek Goyal 
221e43473b7SVivek Goyal 	rcu_read_lock();
222e43473b7SVivek Goyal 	cgroup = task_cgroup(current, blkio_subsys_id);
223e43473b7SVivek Goyal 	tg = throtl_find_alloc_tg(td, cgroup);
224e43473b7SVivek Goyal 	if (!tg)
225e43473b7SVivek Goyal 		tg = &td->root_tg;
226e43473b7SVivek Goyal 	rcu_read_unlock();
227e43473b7SVivek Goyal 	return tg;
228e43473b7SVivek Goyal }
229e43473b7SVivek Goyal 
230e43473b7SVivek Goyal static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
231e43473b7SVivek Goyal {
232e43473b7SVivek Goyal 	/* Service tree is empty */
233e43473b7SVivek Goyal 	if (!root->count)
234e43473b7SVivek Goyal 		return NULL;
235e43473b7SVivek Goyal 
236e43473b7SVivek Goyal 	if (!root->left)
237e43473b7SVivek Goyal 		root->left = rb_first(&root->rb);
238e43473b7SVivek Goyal 
239e43473b7SVivek Goyal 	if (root->left)
240e43473b7SVivek Goyal 		return rb_entry_tg(root->left);
241e43473b7SVivek Goyal 
242e43473b7SVivek Goyal 	return NULL;
243e43473b7SVivek Goyal }
244e43473b7SVivek Goyal 
245e43473b7SVivek Goyal static void rb_erase_init(struct rb_node *n, struct rb_root *root)
246e43473b7SVivek Goyal {
247e43473b7SVivek Goyal 	rb_erase(n, root);
248e43473b7SVivek Goyal 	RB_CLEAR_NODE(n);
249e43473b7SVivek Goyal }
250e43473b7SVivek Goyal 
251e43473b7SVivek Goyal static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
252e43473b7SVivek Goyal {
253e43473b7SVivek Goyal 	if (root->left == n)
254e43473b7SVivek Goyal 		root->left = NULL;
255e43473b7SVivek Goyal 	rb_erase_init(n, &root->rb);
256e43473b7SVivek Goyal 	--root->count;
257e43473b7SVivek Goyal }
258e43473b7SVivek Goyal 
259e43473b7SVivek Goyal static void update_min_dispatch_time(struct throtl_rb_root *st)
260e43473b7SVivek Goyal {
261e43473b7SVivek Goyal 	struct throtl_grp *tg;
262e43473b7SVivek Goyal 
263e43473b7SVivek Goyal 	tg = throtl_rb_first(st);
264e43473b7SVivek Goyal 	if (!tg)
265e43473b7SVivek Goyal 		return;
266e43473b7SVivek Goyal 
267e43473b7SVivek Goyal 	st->min_disptime = tg->disptime;
268e43473b7SVivek Goyal }
269e43473b7SVivek Goyal 
270e43473b7SVivek Goyal static void
271e43473b7SVivek Goyal tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
272e43473b7SVivek Goyal {
273e43473b7SVivek Goyal 	struct rb_node **node = &st->rb.rb_node;
274e43473b7SVivek Goyal 	struct rb_node *parent = NULL;
275e43473b7SVivek Goyal 	struct throtl_grp *__tg;
276e43473b7SVivek Goyal 	unsigned long key = tg->disptime;
277e43473b7SVivek Goyal 	int left = 1;
278e43473b7SVivek Goyal 
279e43473b7SVivek Goyal 	while (*node != NULL) {
280e43473b7SVivek Goyal 		parent = *node;
281e43473b7SVivek Goyal 		__tg = rb_entry_tg(parent);
282e43473b7SVivek Goyal 
283e43473b7SVivek Goyal 		if (time_before(key, __tg->disptime))
284e43473b7SVivek Goyal 			node = &parent->rb_left;
285e43473b7SVivek Goyal 		else {
286e43473b7SVivek Goyal 			node = &parent->rb_right;
287e43473b7SVivek Goyal 			left = 0;
288e43473b7SVivek Goyal 		}
289e43473b7SVivek Goyal 	}
290e43473b7SVivek Goyal 
291e43473b7SVivek Goyal 	if (left)
292e43473b7SVivek Goyal 		st->left = &tg->rb_node;
293e43473b7SVivek Goyal 
294e43473b7SVivek Goyal 	rb_link_node(&tg->rb_node, parent, node);
295e43473b7SVivek Goyal 	rb_insert_color(&tg->rb_node, &st->rb);
296e43473b7SVivek Goyal }
297e43473b7SVivek Goyal 
298e43473b7SVivek Goyal static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
299e43473b7SVivek Goyal {
300e43473b7SVivek Goyal 	struct throtl_rb_root *st = &td->tg_service_tree;
301e43473b7SVivek Goyal 
302e43473b7SVivek Goyal 	tg_service_tree_add(st, tg);
303e43473b7SVivek Goyal 	throtl_mark_tg_on_rr(tg);
304e43473b7SVivek Goyal 	st->count++;
305e43473b7SVivek Goyal }
306e43473b7SVivek Goyal 
307e43473b7SVivek Goyal static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
308e43473b7SVivek Goyal {
309e43473b7SVivek Goyal 	if (!throtl_tg_on_rr(tg))
310e43473b7SVivek Goyal 		__throtl_enqueue_tg(td, tg);
311e43473b7SVivek Goyal }
312e43473b7SVivek Goyal 
313e43473b7SVivek Goyal static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
314e43473b7SVivek Goyal {
315e43473b7SVivek Goyal 	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
316e43473b7SVivek Goyal 	throtl_clear_tg_on_rr(tg);
317e43473b7SVivek Goyal }
318e43473b7SVivek Goyal 
319e43473b7SVivek Goyal static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
320e43473b7SVivek Goyal {
321e43473b7SVivek Goyal 	if (throtl_tg_on_rr(tg))
322e43473b7SVivek Goyal 		__throtl_dequeue_tg(td, tg);
323e43473b7SVivek Goyal }
324e43473b7SVivek Goyal 
325e43473b7SVivek Goyal static void throtl_schedule_next_dispatch(struct throtl_data *td)
326e43473b7SVivek Goyal {
327e43473b7SVivek Goyal 	struct throtl_rb_root *st = &td->tg_service_tree;
328e43473b7SVivek Goyal 
329e43473b7SVivek Goyal 	/*
330e43473b7SVivek Goyal 	 * If there are more bios pending, schedule more work.
331e43473b7SVivek Goyal 	 */
332e43473b7SVivek Goyal 	if (!total_nr_queued(td))
333e43473b7SVivek Goyal 		return;
334e43473b7SVivek Goyal 
335e43473b7SVivek Goyal 	BUG_ON(!st->count);
336e43473b7SVivek Goyal 
337e43473b7SVivek Goyal 	update_min_dispatch_time(st);
338e43473b7SVivek Goyal 
339e43473b7SVivek Goyal 	if (time_before_eq(st->min_disptime, jiffies))
340e43473b7SVivek Goyal 		throtl_schedule_delayed_work(td->queue, 0);
341e43473b7SVivek Goyal 	else
342e43473b7SVivek Goyal 		throtl_schedule_delayed_work(td->queue,
343e43473b7SVivek Goyal 				(st->min_disptime - jiffies));
344e43473b7SVivek Goyal }
345e43473b7SVivek Goyal 
346e43473b7SVivek Goyal static inline void
347e43473b7SVivek Goyal throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
348e43473b7SVivek Goyal {
349e43473b7SVivek Goyal 	tg->bytes_disp[rw] = 0;
3508e89d13fSVivek Goyal 	tg->io_disp[rw] = 0;
351e43473b7SVivek Goyal 	tg->slice_start[rw] = jiffies;
352e43473b7SVivek Goyal 	tg->slice_end[rw] = jiffies + throtl_slice;
353e43473b7SVivek Goyal 	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
354e43473b7SVivek Goyal 			rw == READ ? 'R' : 'W', tg->slice_start[rw],
355e43473b7SVivek Goyal 			tg->slice_end[rw], jiffies);
356e43473b7SVivek Goyal }
357e43473b7SVivek Goyal 
358e43473b7SVivek Goyal static inline void throtl_extend_slice(struct throtl_data *td,
359e43473b7SVivek Goyal 		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
360e43473b7SVivek Goyal {
361e43473b7SVivek Goyal 	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
362e43473b7SVivek Goyal 	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
363e43473b7SVivek Goyal 			rw == READ ? 'R' : 'W', tg->slice_start[rw],
364e43473b7SVivek Goyal 			tg->slice_end[rw], jiffies);
365e43473b7SVivek Goyal }
366e43473b7SVivek Goyal 
367e43473b7SVivek Goyal /* Determine if the previously allocated or extended slice is complete or not */
368e43473b7SVivek Goyal static bool
369e43473b7SVivek Goyal throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
370e43473b7SVivek Goyal {
371e43473b7SVivek Goyal 	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
372e43473b7SVivek Goyal 		return 0;
373e43473b7SVivek Goyal 
374e43473b7SVivek Goyal 	return 1;
375e43473b7SVivek Goyal }
376e43473b7SVivek Goyal 
377e43473b7SVivek Goyal /* Trim the used slices and adjust slice start accordingly */
378e43473b7SVivek Goyal static inline void
379e43473b7SVivek Goyal throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
380e43473b7SVivek Goyal {
3813aad5d3eSVivek Goyal 	unsigned long nr_slices, time_elapsed, io_trim;
3823aad5d3eSVivek Goyal 	u64 bytes_trim, tmp;
383e43473b7SVivek Goyal 
384e43473b7SVivek Goyal 	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
385e43473b7SVivek Goyal 
386e43473b7SVivek Goyal 	/*
387e43473b7SVivek Goyal 	 * If bps are unlimited (-1), then the time slice doesn't get
388e43473b7SVivek Goyal 	 * renewed. Don't try to trim the slice if the slice has been used
389e43473b7SVivek Goyal 	 * up. A new slice will start when appropriate.
390e43473b7SVivek Goyal 	 */
391e43473b7SVivek Goyal 	if (throtl_slice_used(td, tg, rw))
392e43473b7SVivek Goyal 		return;
393e43473b7SVivek Goyal 
394e43473b7SVivek Goyal 	time_elapsed = jiffies - tg->slice_start[rw];
395e43473b7SVivek Goyal 
396e43473b7SVivek Goyal 	nr_slices = time_elapsed / throtl_slice;
397e43473b7SVivek Goyal 
398e43473b7SVivek Goyal 	if (!nr_slices)
399e43473b7SVivek Goyal 		return;
4003aad5d3eSVivek Goyal 	tmp = tg->bps[rw] * throtl_slice * nr_slices;
4013aad5d3eSVivek Goyal 	do_div(tmp, HZ);
4023aad5d3eSVivek Goyal 	bytes_trim = tmp;
403e43473b7SVivek Goyal 
4048e89d13fSVivek Goyal 	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
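	/*
	 * Example (illustrative numbers): assuming tg->bps[rw] = 10485760
	 * (10 MB/s), tg->iops[rw] = 100 and nr_slices = 3, with
	 * throtl_slice = HZ/10 the expressions above give
	 * bytes_trim = 10485760 * 3 / 10 = 3145728 and
	 * io_trim = 100 * 3 / 10 = 30, i.e. the budget of the three fully
	 * elapsed slices is trimmed off.
	 */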
405e43473b7SVivek Goyal 
4068e89d13fSVivek Goyal 	if (!bytes_trim && !io_trim)
407e43473b7SVivek Goyal 		return;
408e43473b7SVivek Goyal 
409e43473b7SVivek Goyal 	if (tg->bytes_disp[rw] >= bytes_trim)
410e43473b7SVivek Goyal 		tg->bytes_disp[rw] -= bytes_trim;
411e43473b7SVivek Goyal 	else
412e43473b7SVivek Goyal 		tg->bytes_disp[rw] = 0;
413e43473b7SVivek Goyal 
4148e89d13fSVivek Goyal 	if (tg->io_disp[rw] >= io_trim)
4158e89d13fSVivek Goyal 		tg->io_disp[rw] -= io_trim;
4168e89d13fSVivek Goyal 	else
4178e89d13fSVivek Goyal 		tg->io_disp[rw] = 0;
4188e89d13fSVivek Goyal 
419e43473b7SVivek Goyal 	tg->slice_start[rw] += nr_slices * throtl_slice;
420e43473b7SVivek Goyal 
4213aad5d3eSVivek Goyal 	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
422e43473b7SVivek Goyal 			" start=%lu end=%lu jiffies=%lu",
4238e89d13fSVivek Goyal 			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
424e43473b7SVivek Goyal 			tg->slice_start[rw], tg->slice_end[rw], jiffies);
425e43473b7SVivek Goyal }
426e43473b7SVivek Goyal 
4278e89d13fSVivek Goyal static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
428e43473b7SVivek Goyal 		struct bio *bio, unsigned long *wait)
429e43473b7SVivek Goyal {
430e43473b7SVivek Goyal 	bool rw = bio_data_dir(bio);
4318e89d13fSVivek Goyal 	unsigned int io_allowed;
432e43473b7SVivek Goyal 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
433e43473b7SVivek Goyal 
4348e89d13fSVivek Goyal 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
435e43473b7SVivek Goyal 
4368e89d13fSVivek Goyal 	/* Slice has just started. Consider one slice interval */
4378e89d13fSVivek Goyal 	if (!jiffy_elapsed)
4388e89d13fSVivek Goyal 		jiffy_elapsed_rnd = throtl_slice;
4398e89d13fSVivek Goyal 
4408e89d13fSVivek Goyal 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
4418e89d13fSVivek Goyal 
442*5e901a2bSVivek Goyal 	io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ;
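	/*
	 * Example (illustrative numbers): assuming tg->iops[rw] = 100 and
	 * jiffy_elapsed_rnd equal to one slice (HZ/10 jiffies),
	 * io_allowed = 100 * (HZ/10) / HZ = 10, so the 11th bio in this
	 * slice has to wait.
	 */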
4438e89d13fSVivek Goyal 
4448e89d13fSVivek Goyal 	if (tg->io_disp[rw] + 1 <= io_allowed) {
445e43473b7SVivek Goyal 		if (wait)
446e43473b7SVivek Goyal 			*wait = 0;
447e43473b7SVivek Goyal 		return 1;
448e43473b7SVivek Goyal 	}
449e43473b7SVivek Goyal 
4508e89d13fSVivek Goyal 	/* Calc approx time to dispatch */
4518e89d13fSVivek Goyal 	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
4528e89d13fSVivek Goyal 
4538e89d13fSVivek Goyal 	if (jiffy_wait > jiffy_elapsed)
4548e89d13fSVivek Goyal 		jiffy_wait = jiffy_wait - jiffy_elapsed;
4558e89d13fSVivek Goyal 	else
4568e89d13fSVivek Goyal 		jiffy_wait = 1;
4578e89d13fSVivek Goyal 
4588e89d13fSVivek Goyal 	if (wait)
4598e89d13fSVivek Goyal 		*wait = jiffy_wait;
4608e89d13fSVivek Goyal 	return 0;
461e43473b7SVivek Goyal }
462e43473b7SVivek Goyal 
4638e89d13fSVivek Goyal static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
4648e89d13fSVivek Goyal 		struct bio *bio, unsigned long *wait)
4658e89d13fSVivek Goyal {
4668e89d13fSVivek Goyal 	bool rw = bio_data_dir(bio);
4673aad5d3eSVivek Goyal 	u64 bytes_allowed, extra_bytes, tmp;
4688e89d13fSVivek Goyal 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
4698e89d13fSVivek Goyal 
470e43473b7SVivek Goyal 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
471e43473b7SVivek Goyal 
472e43473b7SVivek Goyal 	/* Slice has just started. Consider one slice interval */
473e43473b7SVivek Goyal 	if (!jiffy_elapsed)
474e43473b7SVivek Goyal 		jiffy_elapsed_rnd = throtl_slice;
475e43473b7SVivek Goyal 
476e43473b7SVivek Goyal 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
477e43473b7SVivek Goyal 
478*5e901a2bSVivek Goyal 	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
479*5e901a2bSVivek Goyal 	do_div(tmp, HZ);
4803aad5d3eSVivek Goyal 	bytes_allowed = tmp;
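	/*
	 * Example (illustrative numbers): assuming tg->bps[rw] = 1048576
	 * (1 MB/s) and jiffy_elapsed_rnd of two slices (2 * HZ/10 jiffies),
	 * bytes_allowed = 1048576 * 2 / 10 = 209715 bytes; a bio that would
	 * push bytes_disp[rw] past that waits roughly
	 * extra_bytes * HZ / bps jiffies, as computed below.
	 */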
481e43473b7SVivek Goyal 
482e43473b7SVivek Goyal 	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
483e43473b7SVivek Goyal 		if (wait)
484e43473b7SVivek Goyal 			*wait = 0;
485e43473b7SVivek Goyal 		return 1;
486e43473b7SVivek Goyal 	}
487e43473b7SVivek Goyal 
488e43473b7SVivek Goyal 	/* Calc approx time to dispatch */
489e43473b7SVivek Goyal 	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
490e43473b7SVivek Goyal 	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
491e43473b7SVivek Goyal 
492e43473b7SVivek Goyal 	if (!jiffy_wait)
493e43473b7SVivek Goyal 		jiffy_wait = 1;
494e43473b7SVivek Goyal 
495e43473b7SVivek Goyal 	/*
496e43473b7SVivek Goyal 	 * This wait time is without taking into consideration the rounding
497e43473b7SVivek Goyal 	 * up we did. Add that time also.
498e43473b7SVivek Goyal 	 */
499e43473b7SVivek Goyal 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
500e43473b7SVivek Goyal 	if (wait)
501e43473b7SVivek Goyal 		*wait = jiffy_wait;
5028e89d13fSVivek Goyal 	return 0;
5038e89d13fSVivek Goyal }
504e43473b7SVivek Goyal 
5058e89d13fSVivek Goyal /*
5068e89d13fSVivek Goyal  * Returns whether one can dispatch a bio or not. Also returns approx number
5078e89d13fSVivek Goyal  * of jiffies to wait before this bio is within the IO rate and can be dispatched.
5088e89d13fSVivek Goyal  */
5098e89d13fSVivek Goyal static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
5108e89d13fSVivek Goyal 				struct bio *bio, unsigned long *wait)
5118e89d13fSVivek Goyal {
5128e89d13fSVivek Goyal 	bool rw = bio_data_dir(bio);
5138e89d13fSVivek Goyal 	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
5148e89d13fSVivek Goyal 
5158e89d13fSVivek Goyal 	/*
5168e89d13fSVivek Goyal  	 * Currently whole state machine of group depends on first bio
5178e89d13fSVivek Goyal 	 * Currently the whole state machine of the group depends on the
5188e89d13fSVivek Goyal 	 * first bio queued in the group's bio list. So one should not be
5198e89d13fSVivek Goyal 	 * calling this function with a different bio if there are other bios
5208e89d13fSVivek Goyal 	 */
5218e89d13fSVivek Goyal 	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
5228e89d13fSVivek Goyal 
5238e89d13fSVivek Goyal 	/* If tg->bps and tg->iops are both -1, then BW and IOPS are unlimited */
5248e89d13fSVivek Goyal 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
5258e89d13fSVivek Goyal 		if (wait)
5268e89d13fSVivek Goyal 			*wait = 0;
5278e89d13fSVivek Goyal 		return 1;
5288e89d13fSVivek Goyal 	}
5298e89d13fSVivek Goyal 
5308e89d13fSVivek Goyal 	/*
5318e89d13fSVivek Goyal 	 * If the previous slice expired, start a new one, otherwise
5328e89d13fSVivek Goyal 	 * renew/extend the existing slice to make sure it is at least
5338e89d13fSVivek Goyal 	 * throtl_slice interval long since now.
5348e89d13fSVivek Goyal 	 */
5358e89d13fSVivek Goyal 	if (throtl_slice_used(td, tg, rw))
5368e89d13fSVivek Goyal 		throtl_start_new_slice(td, tg, rw);
5378e89d13fSVivek Goyal 	else {
5388e89d13fSVivek Goyal 		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
5398e89d13fSVivek Goyal 			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
5408e89d13fSVivek Goyal 	}
5418e89d13fSVivek Goyal 
5428e89d13fSVivek Goyal 	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
5438e89d13fSVivek Goyal 	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
5448e89d13fSVivek Goyal 		if (wait)
5458e89d13fSVivek Goyal 			*wait = 0;
5468e89d13fSVivek Goyal 		return 1;
5478e89d13fSVivek Goyal 	}
5488e89d13fSVivek Goyal 
5498e89d13fSVivek Goyal 	max_wait = max(bps_wait, iops_wait);
5508e89d13fSVivek Goyal 
5518e89d13fSVivek Goyal 	if (wait)
5528e89d13fSVivek Goyal 		*wait = max_wait;
5538e89d13fSVivek Goyal 
5548e89d13fSVivek Goyal 	if (time_before(tg->slice_end[rw], jiffies + max_wait))
5558e89d13fSVivek Goyal 		throtl_extend_slice(td, tg, rw, jiffies + max_wait);
556e43473b7SVivek Goyal 
557e43473b7SVivek Goyal 	return 0;
558e43473b7SVivek Goyal }
559e43473b7SVivek Goyal 
560e43473b7SVivek Goyal static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
561e43473b7SVivek Goyal {
562e43473b7SVivek Goyal 	bool rw = bio_data_dir(bio);
563e43473b7SVivek Goyal 	bool sync = bio->bi_rw & REQ_SYNC;
564e43473b7SVivek Goyal 
565e43473b7SVivek Goyal 	/* Charge the bio to the group */
566e43473b7SVivek Goyal 	tg->bytes_disp[rw] += bio->bi_size;
5678e89d13fSVivek Goyal 	tg->io_disp[rw]++;
568e43473b7SVivek Goyal 
569e43473b7SVivek Goyal 	/*
570e43473b7SVivek Goyal 	 * TODO: This will take blkg->stats_lock. Figure out a way
571e43473b7SVivek Goyal 	 * to avoid this cost.
572e43473b7SVivek Goyal 	 */
573e43473b7SVivek Goyal 	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
574e43473b7SVivek Goyal }
575e43473b7SVivek Goyal 
576e43473b7SVivek Goyal static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
577e43473b7SVivek Goyal 			struct bio *bio)
578e43473b7SVivek Goyal {
579e43473b7SVivek Goyal 	bool rw = bio_data_dir(bio);
580e43473b7SVivek Goyal 
581e43473b7SVivek Goyal 	bio_list_add(&tg->bio_lists[rw], bio);
582e43473b7SVivek Goyal 	/* Take a bio reference on tg */
583e43473b7SVivek Goyal 	throtl_ref_get_tg(tg);
584e43473b7SVivek Goyal 	tg->nr_queued[rw]++;
585e43473b7SVivek Goyal 	td->nr_queued[rw]++;
586e43473b7SVivek Goyal 	throtl_enqueue_tg(td, tg);
587e43473b7SVivek Goyal }
588e43473b7SVivek Goyal 
589e43473b7SVivek Goyal static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
590e43473b7SVivek Goyal {
591e43473b7SVivek Goyal 	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
592e43473b7SVivek Goyal 	struct bio *bio;
593e43473b7SVivek Goyal 
594e43473b7SVivek Goyal 	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
595e43473b7SVivek Goyal 		tg_may_dispatch(td, tg, bio, &read_wait);
596e43473b7SVivek Goyal 
597e43473b7SVivek Goyal 	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
598e43473b7SVivek Goyal 		tg_may_dispatch(td, tg, bio, &write_wait);
599e43473b7SVivek Goyal 
600e43473b7SVivek Goyal 	min_wait = min(read_wait, write_wait);
601e43473b7SVivek Goyal 	disptime = jiffies + min_wait;
602e43473b7SVivek Goyal 
603e43473b7SVivek Goyal 	/* Update dispatch time */
604e43473b7SVivek Goyal 	throtl_dequeue_tg(td, tg);
605e43473b7SVivek Goyal 	tg->disptime = disptime;
606e43473b7SVivek Goyal 	throtl_enqueue_tg(td, tg);
607e43473b7SVivek Goyal }
608e43473b7SVivek Goyal 
609e43473b7SVivek Goyal static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
610e43473b7SVivek Goyal 				bool rw, struct bio_list *bl)
611e43473b7SVivek Goyal {
612e43473b7SVivek Goyal 	struct bio *bio;
613e43473b7SVivek Goyal 
614e43473b7SVivek Goyal 	bio = bio_list_pop(&tg->bio_lists[rw]);
615e43473b7SVivek Goyal 	tg->nr_queued[rw]--;
616e43473b7SVivek Goyal 	/* Drop bio reference on tg */
617e43473b7SVivek Goyal 	throtl_put_tg(tg);
618e43473b7SVivek Goyal 
619e43473b7SVivek Goyal 	BUG_ON(td->nr_queued[rw] <= 0);
620e43473b7SVivek Goyal 	td->nr_queued[rw]--;
621e43473b7SVivek Goyal 
622e43473b7SVivek Goyal 	throtl_charge_bio(tg, bio);
623e43473b7SVivek Goyal 	bio_list_add(bl, bio);
624e43473b7SVivek Goyal 	bio->bi_rw |= REQ_THROTTLED;
625e43473b7SVivek Goyal 
626e43473b7SVivek Goyal 	throtl_trim_slice(td, tg, rw);
627e43473b7SVivek Goyal }
628e43473b7SVivek Goyal 
629e43473b7SVivek Goyal static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
630e43473b7SVivek Goyal 				struct bio_list *bl)
631e43473b7SVivek Goyal {
632e43473b7SVivek Goyal 	unsigned int nr_reads = 0, nr_writes = 0;
633e43473b7SVivek Goyal 	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
634e43473b7SVivek Goyal 	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
635e43473b7SVivek Goyal 	struct bio *bio;
636e43473b7SVivek Goyal 
637e43473b7SVivek Goyal 	/* Try to dispatch 75% READS and 25% WRITES */
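	/* With throtl_grp_quantum == 8 the intent is at most 6 reads and 2 writes per round. */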
638e43473b7SVivek Goyal 
639e43473b7SVivek Goyal 	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
640e43473b7SVivek Goyal 		&& tg_may_dispatch(td, tg, bio, NULL)) {
641e43473b7SVivek Goyal 
642e43473b7SVivek Goyal 		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
643e43473b7SVivek Goyal 		nr_reads++;
644e43473b7SVivek Goyal 
645e43473b7SVivek Goyal 		if (nr_reads >= max_nr_reads)
646e43473b7SVivek Goyal 			break;
647e43473b7SVivek Goyal 	}
648e43473b7SVivek Goyal 
649e43473b7SVivek Goyal 	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
650e43473b7SVivek Goyal 		&& tg_may_dispatch(td, tg, bio, NULL)) {
651e43473b7SVivek Goyal 
652e43473b7SVivek Goyal 		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
653e43473b7SVivek Goyal 		nr_writes++;
654e43473b7SVivek Goyal 
655e43473b7SVivek Goyal 		if (nr_writes >= max_nr_writes)
656e43473b7SVivek Goyal 			break;
657e43473b7SVivek Goyal 	}
658e43473b7SVivek Goyal 
659e43473b7SVivek Goyal 	return nr_reads + nr_writes;
660e43473b7SVivek Goyal }
661e43473b7SVivek Goyal 
662e43473b7SVivek Goyal static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
663e43473b7SVivek Goyal {
664e43473b7SVivek Goyal 	unsigned int nr_disp = 0;
665e43473b7SVivek Goyal 	struct throtl_grp *tg;
666e43473b7SVivek Goyal 	struct throtl_rb_root *st = &td->tg_service_tree;
667e43473b7SVivek Goyal 
668e43473b7SVivek Goyal 	while (1) {
669e43473b7SVivek Goyal 		tg = throtl_rb_first(st);
670e43473b7SVivek Goyal 
671e43473b7SVivek Goyal 		if (!tg)
672e43473b7SVivek Goyal 			break;
673e43473b7SVivek Goyal 
674e43473b7SVivek Goyal 		if (time_before(jiffies, tg->disptime))
675e43473b7SVivek Goyal 			break;
676e43473b7SVivek Goyal 
677e43473b7SVivek Goyal 		throtl_dequeue_tg(td, tg);
678e43473b7SVivek Goyal 
679e43473b7SVivek Goyal 		nr_disp += throtl_dispatch_tg(td, tg, bl);
680e43473b7SVivek Goyal 
681e43473b7SVivek Goyal 		if (tg->nr_queued[0] || tg->nr_queued[1]) {
682e43473b7SVivek Goyal 			tg_update_disptime(td, tg);
683e43473b7SVivek Goyal 			throtl_enqueue_tg(td, tg);
684e43473b7SVivek Goyal 		}
685e43473b7SVivek Goyal 
686e43473b7SVivek Goyal 		if (nr_disp >= throtl_quantum)
687e43473b7SVivek Goyal 			break;
688e43473b7SVivek Goyal 	}
689e43473b7SVivek Goyal 
690e43473b7SVivek Goyal 	return nr_disp;
691e43473b7SVivek Goyal }
692e43473b7SVivek Goyal 
693fe071437SVivek Goyal static void throtl_process_limit_change(struct throtl_data *td)
694fe071437SVivek Goyal {
695fe071437SVivek Goyal 	struct throtl_grp *tg;
696fe071437SVivek Goyal 	struct hlist_node *pos, *n;
697fe071437SVivek Goyal 
698fe071437SVivek Goyal 	/*
699fe071437SVivek Goyal 	 * Make sure the atomic_inc() effects from the
700fe071437SVivek Goyal 	 * throtl_update_blkio_group_read_bps() group of functions are
701fe071437SVivek Goyal 	 * visible.
702fe071437SVivek Goyal 	 * Is this required, or was the smp_mb__after_atomic_inc()
703fe071437SVivek Goyal 	 * after the atomic_inc() already sufficient?
704fe071437SVivek Goyal 	 */
705fe071437SVivek Goyal 	smp_rmb();
706fe071437SVivek Goyal 	if (!atomic_read(&td->limits_changed))
707fe071437SVivek Goyal 		return;
708fe071437SVivek Goyal 
709fe071437SVivek Goyal 	throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
710fe071437SVivek Goyal 
711fe071437SVivek Goyal 	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
712fe071437SVivek Goyal 		/*
713fe071437SVivek Goyal 		 * Do I need an smp_rmb() here to make sure tg->limits_changed
714fe071437SVivek Goyal 		 * update is visible? I am relying on the smp_rmb() at the
715fe071437SVivek Goyal 		 * beginning of the function and not putting a new one here.
716fe071437SVivek Goyal 		 */
717fe071437SVivek Goyal 
718fe071437SVivek Goyal 		if (throtl_tg_on_rr(tg) && tg->limits_changed) {
719fe071437SVivek Goyal 			throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
720fe071437SVivek Goyal 				" riops=%u wiops=%u", tg->bps[READ],
721fe071437SVivek Goyal 				tg->bps[WRITE], tg->iops[READ],
722fe071437SVivek Goyal 				tg->iops[WRITE]);
723fe071437SVivek Goyal 			tg_update_disptime(td, tg);
724fe071437SVivek Goyal 			tg->limits_changed = false;
725fe071437SVivek Goyal 		}
726fe071437SVivek Goyal 	}
727fe071437SVivek Goyal 
728fe071437SVivek Goyal 	smp_mb__before_atomic_dec();
729fe071437SVivek Goyal 	atomic_dec(&td->limits_changed);
730fe071437SVivek Goyal 	smp_mb__after_atomic_dec();
731fe071437SVivek Goyal }
732fe071437SVivek Goyal 
733e43473b7SVivek Goyal /* Dispatch throttled bios. Should be called without queue lock held. */
734e43473b7SVivek Goyal static int throtl_dispatch(struct request_queue *q)
735e43473b7SVivek Goyal {
736e43473b7SVivek Goyal 	struct throtl_data *td = q->td;
737e43473b7SVivek Goyal 	unsigned int nr_disp = 0;
738e43473b7SVivek Goyal 	struct bio_list bio_list_on_stack;
739e43473b7SVivek Goyal 	struct bio *bio;
740e43473b7SVivek Goyal 
741e43473b7SVivek Goyal 	spin_lock_irq(q->queue_lock);
742e43473b7SVivek Goyal 
743fe071437SVivek Goyal 	throtl_process_limit_change(td);
744fe071437SVivek Goyal 
745e43473b7SVivek Goyal 	if (!total_nr_queued(td))
746e43473b7SVivek Goyal 		goto out;
747e43473b7SVivek Goyal 
748e43473b7SVivek Goyal 	bio_list_init(&bio_list_on_stack);
749e43473b7SVivek Goyal 
750e43473b7SVivek Goyal 	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
751e43473b7SVivek Goyal 			total_nr_queued(td), td->nr_queued[READ],
752e43473b7SVivek Goyal 			td->nr_queued[WRITE]);
753e43473b7SVivek Goyal 
754e43473b7SVivek Goyal 	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
755e43473b7SVivek Goyal 
756e43473b7SVivek Goyal 	if (nr_disp)
757e43473b7SVivek Goyal 		throtl_log(td, "bios disp=%u", nr_disp);
758e43473b7SVivek Goyal 
759e43473b7SVivek Goyal 	throtl_schedule_next_dispatch(td);
760e43473b7SVivek Goyal out:
761e43473b7SVivek Goyal 	spin_unlock_irq(q->queue_lock);
762e43473b7SVivek Goyal 
763e43473b7SVivek Goyal 	/*
764e43473b7SVivek Goyal 	 * If we dispatched some bios, unplug the queue to make sure
765e43473b7SVivek Goyal 	 * they are dispatched immediately.
766e43473b7SVivek Goyal 	 */
767e43473b7SVivek Goyal 	if (nr_disp) {
768e43473b7SVivek Goyal 		while ((bio = bio_list_pop(&bio_list_on_stack)))
769e43473b7SVivek Goyal 			generic_make_request(bio);
770e43473b7SVivek Goyal 		blk_unplug(q);
771e43473b7SVivek Goyal 	}
772e43473b7SVivek Goyal 	return nr_disp;
773e43473b7SVivek Goyal }
774e43473b7SVivek Goyal 
775e43473b7SVivek Goyal void blk_throtl_work(struct work_struct *work)
776e43473b7SVivek Goyal {
777e43473b7SVivek Goyal 	struct throtl_data *td = container_of(work, struct throtl_data,
778e43473b7SVivek Goyal 					throtl_work.work);
779e43473b7SVivek Goyal 	struct request_queue *q = td->queue;
780e43473b7SVivek Goyal 
781e43473b7SVivek Goyal 	throtl_dispatch(q);
782e43473b7SVivek Goyal }
783e43473b7SVivek Goyal 
784e43473b7SVivek Goyal /* Call with queue lock held */
785e43473b7SVivek Goyal void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
786e43473b7SVivek Goyal {
787e43473b7SVivek Goyal 
788e43473b7SVivek Goyal 	struct throtl_data *td = q->td;
789e43473b7SVivek Goyal 	struct delayed_work *dwork = &td->throtl_work;
790e43473b7SVivek Goyal 
791e43473b7SVivek Goyal 	if (total_nr_queued(td) > 0) {
792e43473b7SVivek Goyal 		 * We might have a work scheduled to be executed in the future.
793e43473b7SVivek Goyal 		 * We might have a work scheduled to be executed in future.
794e43473b7SVivek Goyal 		 * Cancel that and schedule a new one.
795e43473b7SVivek Goyal 		 */
796e43473b7SVivek Goyal 		__cancel_delayed_work(dwork);
797e43473b7SVivek Goyal 		kblockd_schedule_delayed_work(q, dwork, delay);
798e43473b7SVivek Goyal 		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
799e43473b7SVivek Goyal 				delay, jiffies);
800e43473b7SVivek Goyal 	}
801e43473b7SVivek Goyal }
802e43473b7SVivek Goyal EXPORT_SYMBOL(throtl_schedule_delayed_work);
803e43473b7SVivek Goyal 
804e43473b7SVivek Goyal static void
805e43473b7SVivek Goyal throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
806e43473b7SVivek Goyal {
807e43473b7SVivek Goyal 	/* Something is wrong if we are trying to remove the same group twice */
808e43473b7SVivek Goyal 	BUG_ON(hlist_unhashed(&tg->tg_node));
809e43473b7SVivek Goyal 
810e43473b7SVivek Goyal 	hlist_del_init(&tg->tg_node);
811e43473b7SVivek Goyal 
812e43473b7SVivek Goyal 	/*
813e43473b7SVivek Goyal 	 * Put the reference taken at the time of creation so that when all
814e43473b7SVivek Goyal 	 * queues are gone, the group can be destroyed.
815e43473b7SVivek Goyal 	 */
816e43473b7SVivek Goyal 	throtl_put_tg(tg);
817e43473b7SVivek Goyal 	td->nr_undestroyed_grps--;
818e43473b7SVivek Goyal }
819e43473b7SVivek Goyal 
820e43473b7SVivek Goyal static void throtl_release_tgs(struct throtl_data *td)
821e43473b7SVivek Goyal {
822e43473b7SVivek Goyal 	struct hlist_node *pos, *n;
823e43473b7SVivek Goyal 	struct throtl_grp *tg;
824e43473b7SVivek Goyal 
825e43473b7SVivek Goyal 	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
826e43473b7SVivek Goyal 		/*
827e43473b7SVivek Goyal 		 * If the cgroup removal path got to the blkio_group first and
828e43473b7SVivek Goyal 		 * removed it from the cgroup list, then it will take care of
829e43473b7SVivek Goyal 		 * destroying the throtl_grp too.
830e43473b7SVivek Goyal 		 */
831e43473b7SVivek Goyal 		if (!blkiocg_del_blkio_group(&tg->blkg))
832e43473b7SVivek Goyal 			throtl_destroy_tg(td, tg);
833e43473b7SVivek Goyal 	}
834e43473b7SVivek Goyal }
835e43473b7SVivek Goyal 
836e43473b7SVivek Goyal static void throtl_td_free(struct throtl_data *td)
837e43473b7SVivek Goyal {
838e43473b7SVivek Goyal 	kfree(td);
839e43473b7SVivek Goyal }
840e43473b7SVivek Goyal 
841e43473b7SVivek Goyal /*
842e43473b7SVivek Goyal  * Blk cgroup controller notification saying that blkio_group object is being
843e43473b7SVivek Goyal  * delinked as the associated cgroup object is going away. That also means that
844e43473b7SVivek Goyal  * no new IO will come into this group. So get rid of this group as soon as
845e43473b7SVivek Goyal  * any pending IO in the group is finished.
846e43473b7SVivek Goyal  *
847e43473b7SVivek Goyal  * This function is called under rcu_read_lock(). key is the rcu protected
848e43473b7SVivek Goyal  * pointer. That means "key" is a valid throtl_data pointer as long as we are
849e43473b7SVivek Goyal  * under rcu read lock.
850e43473b7SVivek Goyal  *
851e43473b7SVivek Goyal  * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
852e43473b7SVivek Goyal  * it should not be NULL as even if the queue was going away, the cgroup
853e43473b7SVivek Goyal  * deletion path got to it first.
854e43473b7SVivek Goyal  */
855e43473b7SVivek Goyal void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
856e43473b7SVivek Goyal {
857e43473b7SVivek Goyal 	unsigned long flags;
858e43473b7SVivek Goyal 	struct throtl_data *td = key;
859e43473b7SVivek Goyal 
860e43473b7SVivek Goyal 	spin_lock_irqsave(td->queue->queue_lock, flags);
861e43473b7SVivek Goyal 	throtl_destroy_tg(td, tg_of_blkg(blkg));
862e43473b7SVivek Goyal 	spin_unlock_irqrestore(td->queue->queue_lock, flags);
863e43473b7SVivek Goyal }
864e43473b7SVivek Goyal 
865fe071437SVivek Goyal /*
866fe071437SVivek Goyal  * For all update functions, key should be a valid pointer because these
867fe071437SVivek Goyal  * update functions are called under blkcg_lock, which means blkg is
868fe071437SVivek Goyal  * valid and in turn key is valid. The queue exit path cannot race
869fe071437SVivek Goyal  * because of blkcg_lock.
870fe071437SVivek Goyal  *
871fe071437SVivek Goyal  * Cannot take the queue lock in update functions as queue lock under blkcg_lock
872fe071437SVivek Goyal  * is not allowed. Under other paths we take blkcg_lock under queue_lock.
873fe071437SVivek Goyal  */
874fe071437SVivek Goyal static void throtl_update_blkio_group_read_bps(void *key,
875fe071437SVivek Goyal 				struct blkio_group *blkg, u64 read_bps)
876e43473b7SVivek Goyal {
877fe071437SVivek Goyal 	struct throtl_data *td = key;
878fe071437SVivek Goyal 
879e43473b7SVivek Goyal 	tg_of_blkg(blkg)->bps[READ] = read_bps;
880fe071437SVivek Goyal 	/* Make sure read_bps is updated before setting limits_changed */
881fe071437SVivek Goyal 	smp_wmb();
882fe071437SVivek Goyal 	tg_of_blkg(blkg)->limits_changed = true;
883fe071437SVivek Goyal 
884fe071437SVivek Goyal 	/* Make sure tg->limits_changed is updated before td->limits_changed */
885fe071437SVivek Goyal 	smp_mb__before_atomic_inc();
886fe071437SVivek Goyal 	atomic_inc(&td->limits_changed);
887fe071437SVivek Goyal 	smp_mb__after_atomic_inc();
888fe071437SVivek Goyal 
889fe071437SVivek Goyal 	/* Schedule a work now to process the limit change */
890fe071437SVivek Goyal 	throtl_schedule_delayed_work(td->queue, 0);
891e43473b7SVivek Goyal }
892e43473b7SVivek Goyal 
893fe071437SVivek Goyal static void throtl_update_blkio_group_write_bps(void *key,
894fe071437SVivek Goyal 				struct blkio_group *blkg, u64 write_bps)
895e43473b7SVivek Goyal {
896fe071437SVivek Goyal 	struct throtl_data *td = key;
897fe071437SVivek Goyal 
898e43473b7SVivek Goyal 	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
899fe071437SVivek Goyal 	smp_wmb();
900fe071437SVivek Goyal 	tg_of_blkg(blkg)->limits_changed = true;
901fe071437SVivek Goyal 	smp_mb__before_atomic_inc();
902fe071437SVivek Goyal 	atomic_inc(&td->limits_changed);
903fe071437SVivek Goyal 	smp_mb__after_atomic_inc();
904fe071437SVivek Goyal 	throtl_schedule_delayed_work(td->queue, 0);
905e43473b7SVivek Goyal }
906e43473b7SVivek Goyal 
907fe071437SVivek Goyal static void throtl_update_blkio_group_read_iops(void *key,
908fe071437SVivek Goyal 			struct blkio_group *blkg, unsigned int read_iops)
9098e89d13fSVivek Goyal {
910fe071437SVivek Goyal 	struct throtl_data *td = key;
911fe071437SVivek Goyal 
9128e89d13fSVivek Goyal 	tg_of_blkg(blkg)->iops[READ] = read_iops;
913fe071437SVivek Goyal 	smp_wmb();
914fe071437SVivek Goyal 	tg_of_blkg(blkg)->limits_changed = true;
915fe071437SVivek Goyal 	smp_mb__before_atomic_inc();
916fe071437SVivek Goyal 	atomic_inc(&td->limits_changed);
917fe071437SVivek Goyal 	smp_mb__after_atomic_inc();
918fe071437SVivek Goyal 	throtl_schedule_delayed_work(td->queue, 0);
9198e89d13fSVivek Goyal }
9208e89d13fSVivek Goyal 
921fe071437SVivek Goyal static void throtl_update_blkio_group_write_iops(void *key,
922fe071437SVivek Goyal 			struct blkio_group *blkg, unsigned int write_iops)
9238e89d13fSVivek Goyal {
924fe071437SVivek Goyal 	struct throtl_data *td = key;
925fe071437SVivek Goyal 
9268e89d13fSVivek Goyal 	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
927fe071437SVivek Goyal 	smp_wmb();
928fe071437SVivek Goyal 	tg_of_blkg(blkg)->limits_changed = true;
929fe071437SVivek Goyal 	smp_mb__before_atomic_inc();
930fe071437SVivek Goyal 	atomic_inc(&td->limits_changed);
931fe071437SVivek Goyal 	smp_mb__after_atomic_inc();
932fe071437SVivek Goyal 	throtl_schedule_delayed_work(td->queue, 0);
9338e89d13fSVivek Goyal }
9348e89d13fSVivek Goyal 
935e43473b7SVivek Goyal void throtl_shutdown_timer_wq(struct request_queue *q)
936e43473b7SVivek Goyal {
937e43473b7SVivek Goyal 	struct throtl_data *td = q->td;
938e43473b7SVivek Goyal 
939e43473b7SVivek Goyal 	cancel_delayed_work_sync(&td->throtl_work);
940e43473b7SVivek Goyal }
941e43473b7SVivek Goyal 
942e43473b7SVivek Goyal static struct blkio_policy_type blkio_policy_throtl = {
943e43473b7SVivek Goyal 	.ops = {
944e43473b7SVivek Goyal 		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
945e43473b7SVivek Goyal 		.blkio_update_group_read_bps_fn =
946e43473b7SVivek Goyal 					throtl_update_blkio_group_read_bps,
947e43473b7SVivek Goyal 		.blkio_update_group_write_bps_fn =
948e43473b7SVivek Goyal 					throtl_update_blkio_group_write_bps,
9498e89d13fSVivek Goyal 		.blkio_update_group_read_iops_fn =
9508e89d13fSVivek Goyal 					throtl_update_blkio_group_read_iops,
9518e89d13fSVivek Goyal 		.blkio_update_group_write_iops_fn =
9528e89d13fSVivek Goyal 					throtl_update_blkio_group_write_iops,
953e43473b7SVivek Goyal 	},
9548e89d13fSVivek Goyal 	.plid = BLKIO_POLICY_THROTL,
955e43473b7SVivek Goyal };
956e43473b7SVivek Goyal 
957e43473b7SVivek Goyal int blk_throtl_bio(struct request_queue *q, struct bio **biop)
958e43473b7SVivek Goyal {
959e43473b7SVivek Goyal 	struct throtl_data *td = q->td;
960e43473b7SVivek Goyal 	struct throtl_grp *tg;
961e43473b7SVivek Goyal 	struct bio *bio = *biop;
962e43473b7SVivek Goyal 	bool rw = bio_data_dir(bio), update_disptime = true;
963e43473b7SVivek Goyal 
964e43473b7SVivek Goyal 	if (bio->bi_rw & REQ_THROTTLED) {
965e43473b7SVivek Goyal 		bio->bi_rw &= ~REQ_THROTTLED;
966e43473b7SVivek Goyal 		return 0;
967e43473b7SVivek Goyal 	}
968e43473b7SVivek Goyal 
969e43473b7SVivek Goyal 	spin_lock_irq(q->queue_lock);
970e43473b7SVivek Goyal 	tg = throtl_get_tg(td);
971e43473b7SVivek Goyal 
972e43473b7SVivek Goyal 	if (tg->nr_queued[rw]) {
973e43473b7SVivek Goyal 		/*
974e43473b7SVivek Goyal 		 * There is already another bio queued in the same direction. No
975e43473b7SVivek Goyal 		 * need to update dispatch time.
976fe071437SVivek Goyal 		 * Still update the disptime if rate limits on this group
977fe071437SVivek Goyal 		 * were changed.
978e43473b7SVivek Goyal 		 */
979fe071437SVivek Goyal 		if (!tg->limits_changed)
980e43473b7SVivek Goyal 			update_disptime = false;
981fe071437SVivek Goyal 		else
982fe071437SVivek Goyal 			tg->limits_changed = false;
983fe071437SVivek Goyal 
984e43473b7SVivek Goyal 		goto queue_bio;
985e43473b7SVivek Goyal 	}
986e43473b7SVivek Goyal 
987e43473b7SVivek Goyal 	/* Bio is within the rate limit of the group */
988e43473b7SVivek Goyal 	if (tg_may_dispatch(td, tg, bio, NULL)) {
989e43473b7SVivek Goyal 		throtl_charge_bio(tg, bio);
990e43473b7SVivek Goyal 		goto out;
991e43473b7SVivek Goyal 	}
992e43473b7SVivek Goyal 
993e43473b7SVivek Goyal queue_bio:
9948e89d13fSVivek Goyal 	throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
9958e89d13fSVivek Goyal 			" iodisp=%u iops=%u queued=%d/%d",
9968e89d13fSVivek Goyal 			rw == READ ? 'R' : 'W',
997e43473b7SVivek Goyal 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
9988e89d13fSVivek Goyal 			tg->io_disp[rw], tg->iops[rw],
999e43473b7SVivek Goyal 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
1000e43473b7SVivek Goyal 
1001e43473b7SVivek Goyal 	throtl_add_bio_tg(q->td, tg, bio);
1002e43473b7SVivek Goyal 	*biop = NULL;
1003e43473b7SVivek Goyal 
1004e43473b7SVivek Goyal 	if (update_disptime) {
1005e43473b7SVivek Goyal 		tg_update_disptime(td, tg);
1006e43473b7SVivek Goyal 		throtl_schedule_next_dispatch(td);
1007e43473b7SVivek Goyal 	}
1008e43473b7SVivek Goyal 
1009e43473b7SVivek Goyal out:
1010e43473b7SVivek Goyal 	spin_unlock_irq(q->queue_lock);
1011e43473b7SVivek Goyal 	return 0;
1012e43473b7SVivek Goyal }
1013e43473b7SVivek Goyal 
1014e43473b7SVivek Goyal int blk_throtl_init(struct request_queue *q)
1015e43473b7SVivek Goyal {
1016e43473b7SVivek Goyal 	struct throtl_data *td;
1017e43473b7SVivek Goyal 	struct throtl_grp *tg;
1018e43473b7SVivek Goyal 
1019e43473b7SVivek Goyal 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1020e43473b7SVivek Goyal 	if (!td)
1021e43473b7SVivek Goyal 		return -ENOMEM;
1022e43473b7SVivek Goyal 
1023e43473b7SVivek Goyal 	INIT_HLIST_HEAD(&td->tg_list);
1024e43473b7SVivek Goyal 	td->tg_service_tree = THROTL_RB_ROOT;
1025fe071437SVivek Goyal 	atomic_set(&td->limits_changed, 0);
1026e43473b7SVivek Goyal 
1027e43473b7SVivek Goyal 	/* Init root group */
1028e43473b7SVivek Goyal 	tg = &td->root_tg;
1029e43473b7SVivek Goyal 	INIT_HLIST_NODE(&tg->tg_node);
1030e43473b7SVivek Goyal 	RB_CLEAR_NODE(&tg->rb_node);
1031e43473b7SVivek Goyal 	bio_list_init(&tg->bio_lists[0]);
1032e43473b7SVivek Goyal 	bio_list_init(&tg->bio_lists[1]);
1033e43473b7SVivek Goyal 
1034e43473b7SVivek Goyal 	/* Practically unlimited BW */
1035e43473b7SVivek Goyal 	tg->bps[0] = tg->bps[1] = -1;
10368e89d13fSVivek Goyal 	tg->iops[0] = tg->iops[1] = -1;
103702977e4aSVivek Goyal 
103802977e4aSVivek Goyal 	/*
103902977e4aSVivek Goyal 	 * Set root group reference to 2. One reference will be dropped when
104002977e4aSVivek Goyal 	 * all groups on tg_list are being deleted during queue exit. The
104102977e4aSVivek Goyal 	 * other reference will remain as we don't want to delete this group
104202977e4aSVivek Goyal 	 * as it is statically allocated and gets destroyed when throtl_data
104302977e4aSVivek Goyal 	 * goes away.
104402977e4aSVivek Goyal 	 */
104502977e4aSVivek Goyal 	atomic_set(&tg->ref, 2);
104602977e4aSVivek Goyal 	hlist_add_head(&tg->tg_node, &td->tg_list);
104702977e4aSVivek Goyal 	td->nr_undestroyed_grps++;
1048e43473b7SVivek Goyal 
1049e43473b7SVivek Goyal 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1050e43473b7SVivek Goyal 
1051e43473b7SVivek Goyal 	rcu_read_lock();
1052e43473b7SVivek Goyal 	blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
1053e43473b7SVivek Goyal 					0, BLKIO_POLICY_THROTL);
1054e43473b7SVivek Goyal 	rcu_read_unlock();
1055e43473b7SVivek Goyal 
1056e43473b7SVivek Goyal 	/* Attach throtl data to request queue */
1057e43473b7SVivek Goyal 	td->queue = q;
1058e43473b7SVivek Goyal 	q->td = td;
1059e43473b7SVivek Goyal 	return 0;
1060e43473b7SVivek Goyal }
1061e43473b7SVivek Goyal 
1062e43473b7SVivek Goyal void blk_throtl_exit(struct request_queue *q)
1063e43473b7SVivek Goyal {
1064e43473b7SVivek Goyal 	struct throtl_data *td = q->td;
1065e43473b7SVivek Goyal 	bool wait = false;
1066e43473b7SVivek Goyal 
1067e43473b7SVivek Goyal 	BUG_ON(!td);
1068e43473b7SVivek Goyal 
1069e43473b7SVivek Goyal 	throtl_shutdown_timer_wq(q);
1070e43473b7SVivek Goyal 
1071e43473b7SVivek Goyal 	spin_lock_irq(q->queue_lock);
1072e43473b7SVivek Goyal 	throtl_release_tgs(td);
1073e43473b7SVivek Goyal 
1074e43473b7SVivek Goyal 	/* If there are other groups */
107502977e4aSVivek Goyal 	if (td->nr_undestroyed_grps > 0)
1076e43473b7SVivek Goyal 		wait = true;
1077e43473b7SVivek Goyal 
1078e43473b7SVivek Goyal 	spin_unlock_irq(q->queue_lock);
1079e43473b7SVivek Goyal 
1080e43473b7SVivek Goyal 	/*
1081e43473b7SVivek Goyal 	 * Wait for tg->blkg->key accessors to exit their grace periods.
1082e43473b7SVivek Goyal 	 * Do this wait only if there are other undestroyed groups out
1083e43473b7SVivek Goyal 	 * there (other than root group). This can happen if cgroup deletion
1084e43473b7SVivek Goyal 	 * path claimed the responsibility of cleaning up a group before
1085e43473b7SVivek Goyal 	 * the queue cleanup code gets to the group.
1086e43473b7SVivek Goyal 	 *
1087e43473b7SVivek Goyal 	 * Do not call synchronize_rcu() unconditionally as there are drivers
1088e43473b7SVivek Goyal 	 * which create/delete request queue hundreds of times during scan/boot
1089e43473b7SVivek Goyal 	 * and synchronize_rcu() can take significant time and slow down boot.
1090e43473b7SVivek Goyal 	 */
1091e43473b7SVivek Goyal 	if (wait)
1092e43473b7SVivek Goyal 		synchronize_rcu();
1093fe071437SVivek Goyal 
1094fe071437SVivek Goyal 	/*
1095fe071437SVivek Goyal 	 * Just to be safe: if somebody updated limits through the cgroup
1096fe071437SVivek Goyal 	 * after the previous flush and another work got queued, cancel it.
1098fe071437SVivek Goyal 	 */
1099fe071437SVivek Goyal 	throtl_shutdown_timer_wq(q);
1100e43473b7SVivek Goyal 	throtl_td_free(td);
1101e43473b7SVivek Goyal }
1102e43473b7SVivek Goyal 
1103e43473b7SVivek Goyal static int __init throtl_init(void)
1104e43473b7SVivek Goyal {
1105e43473b7SVivek Goyal 	blkio_policy_register(&blkio_policy_throtl);
1106e43473b7SVivek Goyal 	return 0;
1107e43473b7SVivek Goyal }
1108e43473b7SVivek Goyal 
1109e43473b7SVivek Goyal module_init(throtl_init);
1110