xref: /linux/block/blk-mq.c (revision 3ef28e83ab15799742e55fd13243a5f678b04242)
175bb4625SJens Axboe /*
275bb4625SJens Axboe  * Block multiqueue core code
375bb4625SJens Axboe  *
475bb4625SJens Axboe  * Copyright (C) 2013-2014 Jens Axboe
575bb4625SJens Axboe  * Copyright (C) 2013-2014 Christoph Hellwig
675bb4625SJens Axboe  */
7320ae51fSJens Axboe #include <linux/kernel.h>
8320ae51fSJens Axboe #include <linux/module.h>
9320ae51fSJens Axboe #include <linux/backing-dev.h>
10320ae51fSJens Axboe #include <linux/bio.h>
11320ae51fSJens Axboe #include <linux/blkdev.h>
12f75782e4SCatalin Marinas #include <linux/kmemleak.h>
13320ae51fSJens Axboe #include <linux/mm.h>
14320ae51fSJens Axboe #include <linux/init.h>
15320ae51fSJens Axboe #include <linux/slab.h>
16320ae51fSJens Axboe #include <linux/workqueue.h>
17320ae51fSJens Axboe #include <linux/smp.h>
18320ae51fSJens Axboe #include <linux/llist.h>
19320ae51fSJens Axboe #include <linux/list_sort.h>
20320ae51fSJens Axboe #include <linux/cpu.h>
21320ae51fSJens Axboe #include <linux/cache.h>
22320ae51fSJens Axboe #include <linux/sched/sysctl.h>
23320ae51fSJens Axboe #include <linux/delay.h>
24aedcd72fSJens Axboe #include <linux/crash_dump.h>
25320ae51fSJens Axboe 
26320ae51fSJens Axboe #include <trace/events/block.h>
27320ae51fSJens Axboe 
28320ae51fSJens Axboe #include <linux/blk-mq.h>
29320ae51fSJens Axboe #include "blk.h"
30320ae51fSJens Axboe #include "blk-mq.h"
31320ae51fSJens Axboe #include "blk-mq-tag.h"
32320ae51fSJens Axboe 
33320ae51fSJens Axboe static DEFINE_MUTEX(all_q_mutex);
34320ae51fSJens Axboe static LIST_HEAD(all_q_list);
35320ae51fSJens Axboe 
36320ae51fSJens Axboe static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
37320ae51fSJens Axboe 
38320ae51fSJens Axboe /*
39320ae51fSJens Axboe  * Check if any of the ctxs have pending work in this hardware queue
40320ae51fSJens Axboe  */
41320ae51fSJens Axboe static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
42320ae51fSJens Axboe {
43320ae51fSJens Axboe 	unsigned int i;
44320ae51fSJens Axboe 
45569fd0ceSJens Axboe 	for (i = 0; i < hctx->ctx_map.size; i++)
461429d7c9SJens Axboe 		if (hctx->ctx_map.map[i].word)
47320ae51fSJens Axboe 			return true;
48320ae51fSJens Axboe 
49320ae51fSJens Axboe 	return false;
50320ae51fSJens Axboe }
51320ae51fSJens Axboe 
521429d7c9SJens Axboe static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
531429d7c9SJens Axboe 					      struct blk_mq_ctx *ctx)
541429d7c9SJens Axboe {
551429d7c9SJens Axboe 	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
561429d7c9SJens Axboe }
571429d7c9SJens Axboe 
581429d7c9SJens Axboe #define CTX_TO_BIT(hctx, ctx)	\
591429d7c9SJens Axboe 	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
601429d7c9SJens Axboe 
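/*
 * Editorial worked example (nothing assumed beyond the definitions above):
 * with ctx_map.bits_per_word == 32, a software queue whose index_hw is 37
 * is tracked in ctx_map.map[37 / 32] == map[1], and CTX_TO_BIT() selects
 * bit (37 & 31) == 5 inside that word.
 */
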
61320ae51fSJens Axboe /*
62320ae51fSJens Axboe  * Mark this ctx as having pending work in this hardware queue
63320ae51fSJens Axboe  */
64320ae51fSJens Axboe static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
65320ae51fSJens Axboe 				     struct blk_mq_ctx *ctx)
66320ae51fSJens Axboe {
671429d7c9SJens Axboe 	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
681429d7c9SJens Axboe 
691429d7c9SJens Axboe 	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
701429d7c9SJens Axboe 		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
711429d7c9SJens Axboe }
721429d7c9SJens Axboe 
731429d7c9SJens Axboe static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
741429d7c9SJens Axboe 				      struct blk_mq_ctx *ctx)
751429d7c9SJens Axboe {
761429d7c9SJens Axboe 	struct blk_align_bitmap *bm = get_bm(hctx, ctx);
771429d7c9SJens Axboe 
781429d7c9SJens Axboe 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
79320ae51fSJens Axboe }
80320ae51fSJens Axboe 
81b4c6a028SKeith Busch void blk_mq_freeze_queue_start(struct request_queue *q)
8243a5e4e2SMing Lei {
834ecd4fefSChristoph Hellwig 	int freeze_depth;
84cddd5d17STejun Heo 
854ecd4fefSChristoph Hellwig 	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
864ecd4fefSChristoph Hellwig 	if (freeze_depth == 1) {
873ef28e83SDan Williams 		percpu_ref_kill(&q->q_usage_counter);
88b94ec296SMike Snitzer 		blk_mq_run_hw_queues(q, false);
89cddd5d17STejun Heo 	}
90f3af020bSTejun Heo }
91b4c6a028SKeith Busch EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
92f3af020bSTejun Heo 
93f3af020bSTejun Heo static void blk_mq_freeze_queue_wait(struct request_queue *q)
94f3af020bSTejun Heo {
953ef28e83SDan Williams 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
9643a5e4e2SMing Lei }
9743a5e4e2SMing Lei 
98f3af020bSTejun Heo /*
99f3af020bSTejun Heo  * Guarantee no request is in use, so we can change any data structure of
100f3af020bSTejun Heo  * the queue afterward.
101f3af020bSTejun Heo  */
1023ef28e83SDan Williams void blk_freeze_queue(struct request_queue *q)
103f3af020bSTejun Heo {
1043ef28e83SDan Williams 	/*
1053ef28e83SDan Williams 	 * In the !blk_mq case we are only calling this to kill the
1063ef28e83SDan Williams 	 * q_usage_counter, otherwise this increases the freeze depth
1073ef28e83SDan Williams 	 * and waits for it to return to zero.  For this reason there is
1083ef28e83SDan Williams 	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
1093ef28e83SDan Williams 	 * exported to drivers, as the only user of unfreeze is blk_mq.
1103ef28e83SDan Williams 	 */
111f3af020bSTejun Heo 	blk_mq_freeze_queue_start(q);
112f3af020bSTejun Heo 	blk_mq_freeze_queue_wait(q);
113f3af020bSTejun Heo }
1143ef28e83SDan Williams 
1153ef28e83SDan Williams void blk_mq_freeze_queue(struct request_queue *q)
1163ef28e83SDan Williams {
1173ef28e83SDan Williams 	/*
1183ef28e83SDan Williams 	 * ...just an alias to keep freeze and unfreeze actions balanced
1193ef28e83SDan Williams 	 * in the blk_mq_* namespace
1203ef28e83SDan Williams 	 */
1213ef28e83SDan Williams 	blk_freeze_queue(q);
1223ef28e83SDan Williams }
123c761d96bSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
124f3af020bSTejun Heo 
125b4c6a028SKeith Busch void blk_mq_unfreeze_queue(struct request_queue *q)
126320ae51fSJens Axboe {
1274ecd4fefSChristoph Hellwig 	int freeze_depth;
128320ae51fSJens Axboe 
1294ecd4fefSChristoph Hellwig 	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
1304ecd4fefSChristoph Hellwig 	WARN_ON_ONCE(freeze_depth < 0);
1314ecd4fefSChristoph Hellwig 	if (!freeze_depth) {
1323ef28e83SDan Williams 		percpu_ref_reinit(&q->q_usage_counter);
133320ae51fSJens Axboe 		wake_up_all(&q->mq_freeze_wq);
134320ae51fSJens Axboe 	}
135add703fdSTejun Heo }
136b4c6a028SKeith Busch EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
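
/*
 * Editorial sketch, not part of blk-mq: the usual driver pattern for
 * changing queue-wide state with no requests in flight is to bracket the
 * update with the freeze/unfreeze pair above. example_update_queue() is a
 * hypothetical name.
 */
#if 0	/* illustration only, never compiled */
static void example_update_queue(struct request_queue *q)
{
	blk_mq_freeze_queue(q);		/* kill q_usage_counter, wait for it to drain */
	/* safely modify queue data structures here */
	blk_mq_unfreeze_queue(q);	/* reinit the counter and wake waiters */
}
#endif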
137320ae51fSJens Axboe 
138aed3ea94SJens Axboe void blk_mq_wake_waiters(struct request_queue *q)
139aed3ea94SJens Axboe {
140aed3ea94SJens Axboe 	struct blk_mq_hw_ctx *hctx;
141aed3ea94SJens Axboe 	unsigned int i;
142aed3ea94SJens Axboe 
143aed3ea94SJens Axboe 	queue_for_each_hw_ctx(q, hctx, i)
144aed3ea94SJens Axboe 		if (blk_mq_hw_queue_mapped(hctx))
145aed3ea94SJens Axboe 			blk_mq_tag_wakeup_all(hctx->tags, true);
1463fd5940cSKeith Busch 
1473fd5940cSKeith Busch 	/*
1483fd5940cSKeith Busch 	 * If we are called because the queue has now been marked as
1493fd5940cSKeith Busch 	 * dying, we need to ensure that processes currently waiting on
1503fd5940cSKeith Busch 	 * the queue are notified as well.
1513fd5940cSKeith Busch 	 */
1523fd5940cSKeith Busch 	wake_up_all(&q->mq_freeze_wq);
153aed3ea94SJens Axboe }
154aed3ea94SJens Axboe 
155320ae51fSJens Axboe bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
156320ae51fSJens Axboe {
157320ae51fSJens Axboe 	return blk_mq_has_free_tags(hctx->tags);
158320ae51fSJens Axboe }
159320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_can_queue);
160320ae51fSJens Axboe 
16194eddfbeSJens Axboe static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
16294eddfbeSJens Axboe 			       struct request *rq, unsigned int rw_flags)
163320ae51fSJens Axboe {
16494eddfbeSJens Axboe 	if (blk_queue_io_stat(q))
16594eddfbeSJens Axboe 		rw_flags |= REQ_IO_STAT;
16694eddfbeSJens Axboe 
167af76e555SChristoph Hellwig 	INIT_LIST_HEAD(&rq->queuelist);
168af76e555SChristoph Hellwig 	/* csd/requeue_work/fifo_time is initialized before use */
169af76e555SChristoph Hellwig 	rq->q = q;
170320ae51fSJens Axboe 	rq->mq_ctx = ctx;
1710d2602caSJens Axboe 	rq->cmd_flags |= rw_flags;
172af76e555SChristoph Hellwig 	/* do not touch atomic flags, it needs atomic ops against the timer */
173af76e555SChristoph Hellwig 	rq->cpu = -1;
174af76e555SChristoph Hellwig 	INIT_HLIST_NODE(&rq->hash);
175af76e555SChristoph Hellwig 	RB_CLEAR_NODE(&rq->rb_node);
176af76e555SChristoph Hellwig 	rq->rq_disk = NULL;
177af76e555SChristoph Hellwig 	rq->part = NULL;
1783ee32372SJens Axboe 	rq->start_time = jiffies;
179af76e555SChristoph Hellwig #ifdef CONFIG_BLK_CGROUP
180af76e555SChristoph Hellwig 	rq->rl = NULL;
1810fec08b4SMing Lei 	set_start_time_ns(rq);
182af76e555SChristoph Hellwig 	rq->io_start_time_ns = 0;
183af76e555SChristoph Hellwig #endif
184af76e555SChristoph Hellwig 	rq->nr_phys_segments = 0;
185af76e555SChristoph Hellwig #if defined(CONFIG_BLK_DEV_INTEGRITY)
186af76e555SChristoph Hellwig 	rq->nr_integrity_segments = 0;
187af76e555SChristoph Hellwig #endif
188af76e555SChristoph Hellwig 	rq->special = NULL;
189af76e555SChristoph Hellwig 	/* tag was already set */
190af76e555SChristoph Hellwig 	rq->errors = 0;
191af76e555SChristoph Hellwig 
1926f4a1626STony Battersby 	rq->cmd = rq->__cmd;
1936f4a1626STony Battersby 
194af76e555SChristoph Hellwig 	rq->extra_len = 0;
195af76e555SChristoph Hellwig 	rq->sense_len = 0;
196af76e555SChristoph Hellwig 	rq->resid_len = 0;
197af76e555SChristoph Hellwig 	rq->sense = NULL;
198af76e555SChristoph Hellwig 
199af76e555SChristoph Hellwig 	INIT_LIST_HEAD(&rq->timeout_list);
200f6be4fb4SJens Axboe 	rq->timeout = 0;
201f6be4fb4SJens Axboe 
202af76e555SChristoph Hellwig 	rq->end_io = NULL;
203af76e555SChristoph Hellwig 	rq->end_io_data = NULL;
204af76e555SChristoph Hellwig 	rq->next_rq = NULL;
205af76e555SChristoph Hellwig 
206320ae51fSJens Axboe 	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
207320ae51fSJens Axboe }
208320ae51fSJens Axboe 
2095dee8577SChristoph Hellwig static struct request *
210cb96a42cSMing Lei __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
2115dee8577SChristoph Hellwig {
2125dee8577SChristoph Hellwig 	struct request *rq;
2135dee8577SChristoph Hellwig 	unsigned int tag;
2145dee8577SChristoph Hellwig 
215cb96a42cSMing Lei 	tag = blk_mq_get_tag(data);
2165dee8577SChristoph Hellwig 	if (tag != BLK_MQ_TAG_FAIL) {
217cb96a42cSMing Lei 		rq = data->hctx->tags->rqs[tag];
2185dee8577SChristoph Hellwig 
219cb96a42cSMing Lei 		if (blk_mq_tag_busy(data->hctx)) {
2205dee8577SChristoph Hellwig 			rq->cmd_flags = REQ_MQ_INFLIGHT;
221cb96a42cSMing Lei 			atomic_inc(&data->hctx->nr_active);
2225dee8577SChristoph Hellwig 		}
2235dee8577SChristoph Hellwig 
2245dee8577SChristoph Hellwig 		rq->tag = tag;
225cb96a42cSMing Lei 		blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
2265dee8577SChristoph Hellwig 		return rq;
2275dee8577SChristoph Hellwig 	}
2285dee8577SChristoph Hellwig 
2295dee8577SChristoph Hellwig 	return NULL;
2305dee8577SChristoph Hellwig }
2315dee8577SChristoph Hellwig 
2324ce01dd1SChristoph Hellwig struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
2334ce01dd1SChristoph Hellwig 		bool reserved)
234320ae51fSJens Axboe {
235d852564fSChristoph Hellwig 	struct blk_mq_ctx *ctx;
236d852564fSChristoph Hellwig 	struct blk_mq_hw_ctx *hctx;
237320ae51fSJens Axboe 	struct request *rq;
238cb96a42cSMing Lei 	struct blk_mq_alloc_data alloc_data;
239a492f075SJoe Lawrence 	int ret;
240320ae51fSJens Axboe 
2413ef28e83SDan Williams 	ret = blk_queue_enter(q, gfp);
242a492f075SJoe Lawrence 	if (ret)
243a492f075SJoe Lawrence 		return ERR_PTR(ret);
244320ae51fSJens Axboe 
245d852564fSChristoph Hellwig 	ctx = blk_mq_get_ctx(q);
246d852564fSChristoph Hellwig 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
247cb96a42cSMing Lei 	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
248cb96a42cSMing Lei 			reserved, ctx, hctx);
249d852564fSChristoph Hellwig 
250cb96a42cSMing Lei 	rq = __blk_mq_alloc_request(&alloc_data, rw);
251d852564fSChristoph Hellwig 	if (!rq && (gfp & __GFP_WAIT)) {
252d852564fSChristoph Hellwig 		__blk_mq_run_hw_queue(hctx);
253d852564fSChristoph Hellwig 		blk_mq_put_ctx(ctx);
254d852564fSChristoph Hellwig 
255d852564fSChristoph Hellwig 		ctx = blk_mq_get_ctx(q);
256d852564fSChristoph Hellwig 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
257cb96a42cSMing Lei 		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
258cb96a42cSMing Lei 				hctx);
259cb96a42cSMing Lei 		rq = __blk_mq_alloc_request(&alloc_data, rw);
260cb96a42cSMing Lei 		ctx = alloc_data.ctx;
261d852564fSChristoph Hellwig 	}
262d852564fSChristoph Hellwig 	blk_mq_put_ctx(ctx);
263c76541a9SKeith Busch 	if (!rq) {
2643ef28e83SDan Williams 		blk_queue_exit(q);
265a492f075SJoe Lawrence 		return ERR_PTR(-EWOULDBLOCK);
266c76541a9SKeith Busch 	}
267320ae51fSJens Axboe 	return rq;
268320ae51fSJens Axboe }
2694bb659b1SJens Axboe EXPORT_SYMBOL(blk_mq_alloc_request);
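
/*
 * Editorial sketch, not part of blk-mq: a driver can use the exported
 * allocator above to build an internal (non-bio) command. The function
 * name is hypothetical and error handling is minimal.
 */
#if 0	/* illustration only, never compiled */
static int example_alloc_internal_cmd(struct request_queue *q)
{
	struct request *rq;

	rq = blk_mq_alloc_request(q, READ, GFP_KERNEL, false);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* set up rq and issue it here, e.g. via blk_execute_rq() */

	blk_mq_free_request(rq);
	return 0;
}
#endif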
270320ae51fSJens Axboe 
271320ae51fSJens Axboe static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
272320ae51fSJens Axboe 				  struct blk_mq_ctx *ctx, struct request *rq)
273320ae51fSJens Axboe {
274320ae51fSJens Axboe 	const int tag = rq->tag;
275320ae51fSJens Axboe 	struct request_queue *q = rq->q;
276320ae51fSJens Axboe 
2770d2602caSJens Axboe 	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
2780d2602caSJens Axboe 		atomic_dec(&hctx->nr_active);
279683d0e12SDavid Hildenbrand 	rq->cmd_flags = 0;
2800d2602caSJens Axboe 
281af76e555SChristoph Hellwig 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
2820d2602caSJens Axboe 	blk_mq_put_tag(hctx, tag, &ctx->last_tag);
2833ef28e83SDan Williams 	blk_queue_exit(q);
284320ae51fSJens Axboe }
285320ae51fSJens Axboe 
2867c7f2f2bSJens Axboe void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
287320ae51fSJens Axboe {
288320ae51fSJens Axboe 	struct blk_mq_ctx *ctx = rq->mq_ctx;
2897c7f2f2bSJens Axboe 
2907c7f2f2bSJens Axboe 	ctx->rq_completed[rq_is_sync(rq)]++;
2917c7f2f2bSJens Axboe 	__blk_mq_free_request(hctx, ctx, rq);
2937c7f2f2bSJens Axboe }
2947c7f2f2bSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
2957c7f2f2bSJens Axboe 
2967c7f2f2bSJens Axboe void blk_mq_free_request(struct request *rq)
2977c7f2f2bSJens Axboe {
298320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
299320ae51fSJens Axboe 	struct request_queue *q = rq->q;
300320ae51fSJens Axboe 
3017c7f2f2bSJens Axboe 	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
3027c7f2f2bSJens Axboe 	blk_mq_free_hctx_request(hctx, rq);
303320ae51fSJens Axboe }
3041a3b595aSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_free_request);
305320ae51fSJens Axboe 
306c8a446adSChristoph Hellwig inline void __blk_mq_end_request(struct request *rq, int error)
307320ae51fSJens Axboe {
3080d11e6acSMing Lei 	blk_account_io_done(rq);
3090d11e6acSMing Lei 
31091b63639SChristoph Hellwig 	if (rq->end_io) {
311320ae51fSJens Axboe 		rq->end_io(rq, error);
31291b63639SChristoph Hellwig 	} else {
31391b63639SChristoph Hellwig 		if (unlikely(blk_bidi_rq(rq)))
31491b63639SChristoph Hellwig 			blk_mq_free_request(rq->next_rq);
315320ae51fSJens Axboe 		blk_mq_free_request(rq);
316320ae51fSJens Axboe 	}
31791b63639SChristoph Hellwig }
318c8a446adSChristoph Hellwig EXPORT_SYMBOL(__blk_mq_end_request);
31963151a44SChristoph Hellwig 
320c8a446adSChristoph Hellwig void blk_mq_end_request(struct request *rq, int error)
32163151a44SChristoph Hellwig {
32263151a44SChristoph Hellwig 	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
32363151a44SChristoph Hellwig 		BUG();
324c8a446adSChristoph Hellwig 	__blk_mq_end_request(rq, error);
32563151a44SChristoph Hellwig }
326c8a446adSChristoph Hellwig EXPORT_SYMBOL(blk_mq_end_request);
327320ae51fSJens Axboe 
32830a91cb4SChristoph Hellwig static void __blk_mq_complete_request_remote(void *data)
329320ae51fSJens Axboe {
3303d6efbf6SChristoph Hellwig 	struct request *rq = data;
331320ae51fSJens Axboe 
33230a91cb4SChristoph Hellwig 	rq->q->softirq_done_fn(rq);
333320ae51fSJens Axboe }
334320ae51fSJens Axboe 
335ed851860SJens Axboe static void blk_mq_ipi_complete_request(struct request *rq)
336320ae51fSJens Axboe {
337320ae51fSJens Axboe 	struct blk_mq_ctx *ctx = rq->mq_ctx;
33838535201SChristoph Hellwig 	bool shared = false;
339320ae51fSJens Axboe 	int cpu;
340320ae51fSJens Axboe 
34138535201SChristoph Hellwig 	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
34230a91cb4SChristoph Hellwig 		rq->q->softirq_done_fn(rq);
34330a91cb4SChristoph Hellwig 		return;
34430a91cb4SChristoph Hellwig 	}
345320ae51fSJens Axboe 
346320ae51fSJens Axboe 	cpu = get_cpu();
34738535201SChristoph Hellwig 	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
34838535201SChristoph Hellwig 		shared = cpus_share_cache(cpu, ctx->cpu);
34938535201SChristoph Hellwig 
35038535201SChristoph Hellwig 	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
35130a91cb4SChristoph Hellwig 		rq->csd.func = __blk_mq_complete_request_remote;
3523d6efbf6SChristoph Hellwig 		rq->csd.info = rq;
3533d6efbf6SChristoph Hellwig 		rq->csd.flags = 0;
354c46fff2aSFrederic Weisbecker 		smp_call_function_single_async(ctx->cpu, &rq->csd);
3553d6efbf6SChristoph Hellwig 	} else {
35630a91cb4SChristoph Hellwig 		rq->q->softirq_done_fn(rq);
3573d6efbf6SChristoph Hellwig 	}
358320ae51fSJens Axboe 	put_cpu();
359320ae51fSJens Axboe }
36030a91cb4SChristoph Hellwig 
361ed851860SJens Axboe void __blk_mq_complete_request(struct request *rq)
362ed851860SJens Axboe {
363ed851860SJens Axboe 	struct request_queue *q = rq->q;
364ed851860SJens Axboe 
365ed851860SJens Axboe 	if (!q->softirq_done_fn)
366c8a446adSChristoph Hellwig 		blk_mq_end_request(rq, rq->errors);
367ed851860SJens Axboe 	else
368ed851860SJens Axboe 		blk_mq_ipi_complete_request(rq);
369ed851860SJens Axboe }
370ed851860SJens Axboe 
37130a91cb4SChristoph Hellwig /**
37230a91cb4SChristoph Hellwig  * blk_mq_complete_request - end I/O on a request
37330a91cb4SChristoph Hellwig  * @rq:		the request being processed
37430a91cb4SChristoph Hellwig  *
37530a91cb4SChristoph Hellwig  * Description:
37630a91cb4SChristoph Hellwig  *	Ends all I/O on a request. It does not handle partial completions.
37730a91cb4SChristoph Hellwig  *	The actual completion happens out-of-order, through an IPI handler.
37830a91cb4SChristoph Hellwig  **/
379f4829a9bSChristoph Hellwig void blk_mq_complete_request(struct request *rq, int error)
38030a91cb4SChristoph Hellwig {
38195f09684SJens Axboe 	struct request_queue *q = rq->q;
38295f09684SJens Axboe 
38395f09684SJens Axboe 	if (unlikely(blk_should_fake_timeout(q)))
38430a91cb4SChristoph Hellwig 		return;
385f4829a9bSChristoph Hellwig 	if (!blk_mark_rq_complete(rq)) {
386f4829a9bSChristoph Hellwig 		rq->errors = error;
38730a91cb4SChristoph Hellwig 		__blk_mq_complete_request(rq);
38830a91cb4SChristoph Hellwig 	}
389f4829a9bSChristoph Hellwig }
39030a91cb4SChristoph Hellwig EXPORT_SYMBOL(blk_mq_complete_request);
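
/*
 * Editorial sketch, not part of blk-mq: a driver's interrupt handler
 * normally hands finished requests back via blk_mq_complete_request(),
 * which defers the heavy lifting to softirq_done_fn (possibly over IPI).
 * example_irq() and example_next_done() are hypothetical.
 */
#if 0	/* illustration only, never compiled */
static irqreturn_t example_irq(int irq, void *data)
{
	struct request *rq;

	while ((rq = example_next_done(data)) != NULL)
		blk_mq_complete_request(rq, 0);	/* 0 == no error */

	return IRQ_HANDLED;
}
#endif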
391320ae51fSJens Axboe 
392973c0191SKeith Busch int blk_mq_request_started(struct request *rq)
393973c0191SKeith Busch {
394973c0191SKeith Busch 	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
395973c0191SKeith Busch }
396973c0191SKeith Busch EXPORT_SYMBOL_GPL(blk_mq_request_started);
397973c0191SKeith Busch 
398e2490073SChristoph Hellwig void blk_mq_start_request(struct request *rq)
399320ae51fSJens Axboe {
400320ae51fSJens Axboe 	struct request_queue *q = rq->q;
401320ae51fSJens Axboe 
402320ae51fSJens Axboe 	trace_block_rq_issue(q, rq);
403320ae51fSJens Axboe 
404742ee69bSChristoph Hellwig 	rq->resid_len = blk_rq_bytes(rq);
40591b63639SChristoph Hellwig 	if (unlikely(blk_bidi_rq(rq)))
40691b63639SChristoph Hellwig 		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
407742ee69bSChristoph Hellwig 
4082b8393b4SMing Lei 	blk_add_timer(rq);
40987ee7b11SJens Axboe 
41087ee7b11SJens Axboe 	/*
411538b7534SJens Axboe 	 * Ensure that ->deadline is visible before set the started
412538b7534SJens Axboe 	 * flag and clear the completed flag.
413538b7534SJens Axboe 	 */
414538b7534SJens Axboe 	smp_mb__before_atomic();
415538b7534SJens Axboe 
416538b7534SJens Axboe 	/*
41787ee7b11SJens Axboe 	 * Mark us as started and clear complete. Complete might have been
41887ee7b11SJens Axboe 	 * set if requeue raced with timeout, which then marked it as
41987ee7b11SJens Axboe 	 * complete. So be sure to clear complete again when we start
42087ee7b11SJens Axboe 	 * the request, otherwise we'll ignore the completion event.
42187ee7b11SJens Axboe 	 */
4224b570521SJens Axboe 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
423320ae51fSJens Axboe 		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
4244b570521SJens Axboe 	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
42587ee7b11SJens Axboe 		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
42649f5baa5SChristoph Hellwig 
42749f5baa5SChristoph Hellwig 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
42849f5baa5SChristoph Hellwig 		/*
42949f5baa5SChristoph Hellwig 		 * Make sure space for the drain appears.  We know we can do
43049f5baa5SChristoph Hellwig 		 * this because max_hw_segments has been adjusted to be one
43149f5baa5SChristoph Hellwig 		 * fewer than the device can handle.
43249f5baa5SChristoph Hellwig 		 */
43349f5baa5SChristoph Hellwig 		rq->nr_phys_segments++;
43449f5baa5SChristoph Hellwig 	}
435320ae51fSJens Axboe }
436e2490073SChristoph Hellwig EXPORT_SYMBOL(blk_mq_start_request);
437320ae51fSJens Axboe 
438ed0791b2SChristoph Hellwig static void __blk_mq_requeue_request(struct request *rq)
439320ae51fSJens Axboe {
440320ae51fSJens Axboe 	struct request_queue *q = rq->q;
441320ae51fSJens Axboe 
442320ae51fSJens Axboe 	trace_block_rq_requeue(q, rq);
44349f5baa5SChristoph Hellwig 
444e2490073SChristoph Hellwig 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
44549f5baa5SChristoph Hellwig 		if (q->dma_drain_size && blk_rq_bytes(rq))
44649f5baa5SChristoph Hellwig 			rq->nr_phys_segments--;
447320ae51fSJens Axboe 	}
448e2490073SChristoph Hellwig }
449320ae51fSJens Axboe 
450ed0791b2SChristoph Hellwig void blk_mq_requeue_request(struct request *rq)
451ed0791b2SChristoph Hellwig {
452ed0791b2SChristoph Hellwig 	__blk_mq_requeue_request(rq);
453ed0791b2SChristoph Hellwig 
454ed0791b2SChristoph Hellwig 	BUG_ON(blk_queued_rq(rq));
4556fca6a61SChristoph Hellwig 	blk_mq_add_to_requeue_list(rq, true);
456ed0791b2SChristoph Hellwig }
457ed0791b2SChristoph Hellwig EXPORT_SYMBOL(blk_mq_requeue_request);
458ed0791b2SChristoph Hellwig 
4596fca6a61SChristoph Hellwig static void blk_mq_requeue_work(struct work_struct *work)
4606fca6a61SChristoph Hellwig {
4616fca6a61SChristoph Hellwig 	struct request_queue *q =
4626fca6a61SChristoph Hellwig 		container_of(work, struct request_queue, requeue_work);
4636fca6a61SChristoph Hellwig 	LIST_HEAD(rq_list);
4646fca6a61SChristoph Hellwig 	struct request *rq, *next;
4656fca6a61SChristoph Hellwig 	unsigned long flags;
4666fca6a61SChristoph Hellwig 
4676fca6a61SChristoph Hellwig 	spin_lock_irqsave(&q->requeue_lock, flags);
4686fca6a61SChristoph Hellwig 	list_splice_init(&q->requeue_list, &rq_list);
4696fca6a61SChristoph Hellwig 	spin_unlock_irqrestore(&q->requeue_lock, flags);
4706fca6a61SChristoph Hellwig 
4716fca6a61SChristoph Hellwig 	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
4726fca6a61SChristoph Hellwig 		if (!(rq->cmd_flags & REQ_SOFTBARRIER))
4736fca6a61SChristoph Hellwig 			continue;
4746fca6a61SChristoph Hellwig 
4756fca6a61SChristoph Hellwig 		rq->cmd_flags &= ~REQ_SOFTBARRIER;
4766fca6a61SChristoph Hellwig 		list_del_init(&rq->queuelist);
4776fca6a61SChristoph Hellwig 		blk_mq_insert_request(rq, true, false, false);
4786fca6a61SChristoph Hellwig 	}
4796fca6a61SChristoph Hellwig 
4806fca6a61SChristoph Hellwig 	while (!list_empty(&rq_list)) {
4816fca6a61SChristoph Hellwig 		rq = list_entry(rq_list.next, struct request, queuelist);
4826fca6a61SChristoph Hellwig 		list_del_init(&rq->queuelist);
4836fca6a61SChristoph Hellwig 		blk_mq_insert_request(rq, false, false, false);
4846fca6a61SChristoph Hellwig 	}
4856fca6a61SChristoph Hellwig 
4868b957415SJens Axboe 	/*
4878b957415SJens Axboe 	 * Use the start variant of queue running here, so that running
4888b957415SJens Axboe 	 * the requeue work will kick stopped queues.
4898b957415SJens Axboe 	 */
4908b957415SJens Axboe 	blk_mq_start_hw_queues(q);
4916fca6a61SChristoph Hellwig }
4926fca6a61SChristoph Hellwig 
4936fca6a61SChristoph Hellwig void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
4946fca6a61SChristoph Hellwig {
4956fca6a61SChristoph Hellwig 	struct request_queue *q = rq->q;
4966fca6a61SChristoph Hellwig 	unsigned long flags;
4976fca6a61SChristoph Hellwig 
4986fca6a61SChristoph Hellwig 	/*
4996fca6a61SChristoph Hellwig 	 * We abuse this flag that is otherwise used by the I/O scheduler to
5006fca6a61SChristoph Hellwig 	 * request head insertion from the workqueue.
5016fca6a61SChristoph Hellwig 	 */
5026fca6a61SChristoph Hellwig 	BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
5036fca6a61SChristoph Hellwig 
5046fca6a61SChristoph Hellwig 	spin_lock_irqsave(&q->requeue_lock, flags);
5056fca6a61SChristoph Hellwig 	if (at_head) {
5066fca6a61SChristoph Hellwig 		rq->cmd_flags |= REQ_SOFTBARRIER;
5076fca6a61SChristoph Hellwig 		list_add(&rq->queuelist, &q->requeue_list);
5086fca6a61SChristoph Hellwig 	} else {
5096fca6a61SChristoph Hellwig 		list_add_tail(&rq->queuelist, &q->requeue_list);
5106fca6a61SChristoph Hellwig 	}
5116fca6a61SChristoph Hellwig 	spin_unlock_irqrestore(&q->requeue_lock, flags);
5126fca6a61SChristoph Hellwig }
5136fca6a61SChristoph Hellwig EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
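
/*
 * Editorial sketch, not part of blk-mq: a driver that cannot complete a
 * request right now (e.g. after a transient path failure) typically
 * requeues it and then kicks the requeue list so blk_mq_requeue_work()
 * reinserts and redispatches it. The helper name is hypothetical.
 */
#if 0	/* illustration only, never compiled */
static void example_retry_later(struct request *rq)
{
	blk_mq_requeue_request(rq);		/* goes onto q->requeue_list */
	blk_mq_kick_requeue_list(rq->q);	/* schedules blk_mq_requeue_work() */
}
#endif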
5146fca6a61SChristoph Hellwig 
515c68ed59fSKeith Busch void blk_mq_cancel_requeue_work(struct request_queue *q)
516c68ed59fSKeith Busch {
517c68ed59fSKeith Busch 	cancel_work_sync(&q->requeue_work);
518c68ed59fSKeith Busch }
519c68ed59fSKeith Busch EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
520c68ed59fSKeith Busch 
5216fca6a61SChristoph Hellwig void blk_mq_kick_requeue_list(struct request_queue *q)
5226fca6a61SChristoph Hellwig {
5236fca6a61SChristoph Hellwig 	kblockd_schedule_work(&q->requeue_work);
5246fca6a61SChristoph Hellwig }
5256fca6a61SChristoph Hellwig EXPORT_SYMBOL(blk_mq_kick_requeue_list);
5266fca6a61SChristoph Hellwig 
5271885b24dSJens Axboe void blk_mq_abort_requeue_list(struct request_queue *q)
5281885b24dSJens Axboe {
5291885b24dSJens Axboe 	unsigned long flags;
5301885b24dSJens Axboe 	LIST_HEAD(rq_list);
5311885b24dSJens Axboe 
5321885b24dSJens Axboe 	spin_lock_irqsave(&q->requeue_lock, flags);
5331885b24dSJens Axboe 	list_splice_init(&q->requeue_list, &rq_list);
5341885b24dSJens Axboe 	spin_unlock_irqrestore(&q->requeue_lock, flags);
5351885b24dSJens Axboe 
5361885b24dSJens Axboe 	while (!list_empty(&rq_list)) {
5371885b24dSJens Axboe 		struct request *rq;
5381885b24dSJens Axboe 
5391885b24dSJens Axboe 		rq = list_first_entry(&rq_list, struct request, queuelist);
5401885b24dSJens Axboe 		list_del_init(&rq->queuelist);
5411885b24dSJens Axboe 		rq->errors = -EIO;
5421885b24dSJens Axboe 		blk_mq_end_request(rq, rq->errors);
5431885b24dSJens Axboe 	}
5441885b24dSJens Axboe }
5451885b24dSJens Axboe EXPORT_SYMBOL(blk_mq_abort_requeue_list);
5461885b24dSJens Axboe 
5470e62f51fSJens Axboe struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
5480e62f51fSJens Axboe {
5490048b483SMing Lei 	return tags->rqs[tag];
55024d2f903SChristoph Hellwig }
55124d2f903SChristoph Hellwig EXPORT_SYMBOL(blk_mq_tag_to_rq);
55224d2f903SChristoph Hellwig 
553320ae51fSJens Axboe struct blk_mq_timeout_data {
55446f92d42SChristoph Hellwig 	unsigned long next;
55546f92d42SChristoph Hellwig 	unsigned int next_set;
556320ae51fSJens Axboe };
557320ae51fSJens Axboe 
55890415837SChristoph Hellwig void blk_mq_rq_timed_out(struct request *req, bool reserved)
559320ae51fSJens Axboe {
56046f92d42SChristoph Hellwig 	struct blk_mq_ops *ops = req->q->mq_ops;
56146f92d42SChristoph Hellwig 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
56287ee7b11SJens Axboe 
56387ee7b11SJens Axboe 	/*
56487ee7b11SJens Axboe 	 * We know that complete is set at this point. If STARTED isn't set
56587ee7b11SJens Axboe 	 * anymore, then the request isn't active and the "timeout" should
56687ee7b11SJens Axboe 	 * just be ignored. This can happen due to the bitflag ordering.
56787ee7b11SJens Axboe 	 * Timeout first checks if STARTED is set, and if it is, assumes
56887ee7b11SJens Axboe 	 * the request is active. But if we race with completion, then
56987ee7b11SJens Axboe 	 * both flags will get cleared. So check here again, and ignore
57087ee7b11SJens Axboe 	 * a timeout event with a request that isn't active.
57187ee7b11SJens Axboe 	 */
57246f92d42SChristoph Hellwig 	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
57346f92d42SChristoph Hellwig 		return;
57487ee7b11SJens Axboe 
57546f92d42SChristoph Hellwig 	if (ops->timeout)
5760152fb6bSChristoph Hellwig 		ret = ops->timeout(req, reserved);
57787ee7b11SJens Axboe 
57846f92d42SChristoph Hellwig 	switch (ret) {
57946f92d42SChristoph Hellwig 	case BLK_EH_HANDLED:
58046f92d42SChristoph Hellwig 		__blk_mq_complete_request(req);
58146f92d42SChristoph Hellwig 		break;
58246f92d42SChristoph Hellwig 	case BLK_EH_RESET_TIMER:
58346f92d42SChristoph Hellwig 		blk_add_timer(req);
58446f92d42SChristoph Hellwig 		blk_clear_rq_complete(req);
58546f92d42SChristoph Hellwig 		break;
58646f92d42SChristoph Hellwig 	case BLK_EH_NOT_HANDLED:
58746f92d42SChristoph Hellwig 		break;
58846f92d42SChristoph Hellwig 	default:
58946f92d42SChristoph Hellwig 		printk(KERN_ERR "block: bad eh return: %d\n", ret);
59046f92d42SChristoph Hellwig 		break;
59187ee7b11SJens Axboe 	}
59287ee7b11SJens Axboe }
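
/*
 * Editorial sketch, not part of blk-mq: a driver's ->timeout callback
 * steers blk_mq_rq_timed_out() above. Returning BLK_EH_HANDLED makes
 * blk-mq complete the request; BLK_EH_RESET_TIMER re-arms the timer.
 * example_abort_cmd() is hypothetical.
 */
#if 0	/* illustration only, never compiled */
static enum blk_eh_timer_return example_timeout(struct request *rq, bool reserved)
{
	if (example_abort_cmd(rq) == 0) {
		rq->errors = -ETIMEDOUT;	/* picked up by __blk_mq_complete_request() */
		return BLK_EH_HANDLED;
	}
	return BLK_EH_RESET_TIMER;		/* give the command more time */
}
#endif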
59387ee7b11SJens Axboe 
59481481eb4SChristoph Hellwig static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
59581481eb4SChristoph Hellwig 		struct request *rq, void *priv, bool reserved)
596320ae51fSJens Axboe {
59781481eb4SChristoph Hellwig 	struct blk_mq_timeout_data *data = priv;
59881481eb4SChristoph Hellwig 
599eb130dbfSKeith Busch 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
600eb130dbfSKeith Busch 		/*
601eb130dbfSKeith Busch 		 * If a request wasn't started before the queue was
602eb130dbfSKeith Busch 		 * marked dying, kill it here or it'll go unnoticed.
603eb130dbfSKeith Busch 		 */
604f4829a9bSChristoph Hellwig 		if (unlikely(blk_queue_dying(rq->q)))
605f4829a9bSChristoph Hellwig 			blk_mq_complete_request(rq, -EIO);
6065b3f25fcSKeith Busch 		return;
607eb130dbfSKeith Busch 	}
6085b3f25fcSKeith Busch 	if (rq->cmd_flags & REQ_NO_TIMEOUT)
60946f92d42SChristoph Hellwig 		return;
61046f92d42SChristoph Hellwig 
61146f92d42SChristoph Hellwig 	if (time_after_eq(jiffies, rq->deadline)) {
61246f92d42SChristoph Hellwig 		if (!blk_mark_rq_complete(rq))
6130152fb6bSChristoph Hellwig 			blk_mq_rq_timed_out(rq, reserved);
61446f92d42SChristoph Hellwig 	} else if (!data->next_set || time_after(data->next, rq->deadline)) {
61546f92d42SChristoph Hellwig 		data->next = rq->deadline;
61646f92d42SChristoph Hellwig 		data->next_set = 1;
61746f92d42SChristoph Hellwig 	}
61881481eb4SChristoph Hellwig }
61981481eb4SChristoph Hellwig 
62081481eb4SChristoph Hellwig static void blk_mq_rq_timer(unsigned long priv)
62181481eb4SChristoph Hellwig {
62281481eb4SChristoph Hellwig 	struct request_queue *q = (struct request_queue *)priv;
62381481eb4SChristoph Hellwig 	struct blk_mq_timeout_data data = {
62481481eb4SChristoph Hellwig 		.next		= 0,
62581481eb4SChristoph Hellwig 		.next_set	= 0,
62681481eb4SChristoph Hellwig 	};
62781481eb4SChristoph Hellwig 	int i;
628320ae51fSJens Axboe 
6290bf6cd5bSChristoph Hellwig 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
630320ae51fSJens Axboe 
63181481eb4SChristoph Hellwig 	if (data.next_set) {
63281481eb4SChristoph Hellwig 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
63381481eb4SChristoph Hellwig 		mod_timer(&q->timeout, data.next);
6340d2602caSJens Axboe 	} else {
6350bf6cd5bSChristoph Hellwig 		struct blk_mq_hw_ctx *hctx;
6360bf6cd5bSChristoph Hellwig 
637f054b56cSMing Lei 		queue_for_each_hw_ctx(q, hctx, i) {
638f054b56cSMing Lei 			/* the hctx may be unmapped, so check it here */
639f054b56cSMing Lei 			if (blk_mq_hw_queue_mapped(hctx))
6400d2602caSJens Axboe 				blk_mq_tag_idle(hctx);
6410d2602caSJens Axboe 		}
642320ae51fSJens Axboe 	}
643f054b56cSMing Lei }
644320ae51fSJens Axboe 
645320ae51fSJens Axboe /*
646320ae51fSJens Axboe  * Reverse check our software queue for entries that we could potentially
647320ae51fSJens Axboe  * merge with. Currently includes a hand-wavy stop count of 8, to not spend
648320ae51fSJens Axboe  * too much time checking for merges.
649320ae51fSJens Axboe  */
650320ae51fSJens Axboe static bool blk_mq_attempt_merge(struct request_queue *q,
651320ae51fSJens Axboe 				 struct blk_mq_ctx *ctx, struct bio *bio)
652320ae51fSJens Axboe {
653320ae51fSJens Axboe 	struct request *rq;
654320ae51fSJens Axboe 	int checked = 8;
655320ae51fSJens Axboe 
656320ae51fSJens Axboe 	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
657320ae51fSJens Axboe 		int el_ret;
658320ae51fSJens Axboe 
659320ae51fSJens Axboe 		if (!checked--)
660320ae51fSJens Axboe 			break;
661320ae51fSJens Axboe 
662320ae51fSJens Axboe 		if (!blk_rq_merge_ok(rq, bio))
663320ae51fSJens Axboe 			continue;
664320ae51fSJens Axboe 
665320ae51fSJens Axboe 		el_ret = blk_try_merge(rq, bio);
666320ae51fSJens Axboe 		if (el_ret == ELEVATOR_BACK_MERGE) {
667320ae51fSJens Axboe 			if (bio_attempt_back_merge(q, rq, bio)) {
668320ae51fSJens Axboe 				ctx->rq_merged++;
669320ae51fSJens Axboe 				return true;
670320ae51fSJens Axboe 			}
671320ae51fSJens Axboe 			break;
672320ae51fSJens Axboe 		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
673320ae51fSJens Axboe 			if (bio_attempt_front_merge(q, rq, bio)) {
674320ae51fSJens Axboe 				ctx->rq_merged++;
675320ae51fSJens Axboe 				return true;
676320ae51fSJens Axboe 			}
677320ae51fSJens Axboe 			break;
678320ae51fSJens Axboe 		}
679320ae51fSJens Axboe 	}
680320ae51fSJens Axboe 
681320ae51fSJens Axboe 	return false;
682320ae51fSJens Axboe }
683320ae51fSJens Axboe 
684320ae51fSJens Axboe /*
6851429d7c9SJens Axboe  * Process software queues that have been marked busy, splicing them
6861429d7c9SJens Axboe  * to the for-dispatch list.
6871429d7c9SJens Axboe  */
6881429d7c9SJens Axboe static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
6891429d7c9SJens Axboe {
6901429d7c9SJens Axboe 	struct blk_mq_ctx *ctx;
6911429d7c9SJens Axboe 	int i;
6921429d7c9SJens Axboe 
693569fd0ceSJens Axboe 	for (i = 0; i < hctx->ctx_map.size; i++) {
6941429d7c9SJens Axboe 		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
6951429d7c9SJens Axboe 		unsigned int off, bit;
6961429d7c9SJens Axboe 
6971429d7c9SJens Axboe 		if (!bm->word)
6981429d7c9SJens Axboe 			continue;
6991429d7c9SJens Axboe 
7001429d7c9SJens Axboe 		bit = 0;
7011429d7c9SJens Axboe 		off = i * hctx->ctx_map.bits_per_word;
7021429d7c9SJens Axboe 		do {
7031429d7c9SJens Axboe 			bit = find_next_bit(&bm->word, bm->depth, bit);
7041429d7c9SJens Axboe 			if (bit >= bm->depth)
7051429d7c9SJens Axboe 				break;
7061429d7c9SJens Axboe 
7071429d7c9SJens Axboe 			ctx = hctx->ctxs[bit + off];
7081429d7c9SJens Axboe 			clear_bit(bit, &bm->word);
7091429d7c9SJens Axboe 			spin_lock(&ctx->lock);
7101429d7c9SJens Axboe 			list_splice_tail_init(&ctx->rq_list, list);
7111429d7c9SJens Axboe 			spin_unlock(&ctx->lock);
7121429d7c9SJens Axboe 
7131429d7c9SJens Axboe 			bit++;
7141429d7c9SJens Axboe 		} while (1);
7151429d7c9SJens Axboe 	}
7161429d7c9SJens Axboe }
7171429d7c9SJens Axboe 
7181429d7c9SJens Axboe /*
719320ae51fSJens Axboe  * Run this hardware queue, pulling any software queues mapped to it in.
720320ae51fSJens Axboe  * Note that this function currently has various problems around ordering
721320ae51fSJens Axboe  * of IO. In particular, we'd like FIFO behaviour on handling existing
722320ae51fSJens Axboe  * items on the hctx->dispatch list. Ignore that for now.
723320ae51fSJens Axboe  */
724320ae51fSJens Axboe static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
725320ae51fSJens Axboe {
726320ae51fSJens Axboe 	struct request_queue *q = hctx->queue;
727320ae51fSJens Axboe 	struct request *rq;
728320ae51fSJens Axboe 	LIST_HEAD(rq_list);
72974c45052SJens Axboe 	LIST_HEAD(driver_list);
73074c45052SJens Axboe 	struct list_head *dptr;
7311429d7c9SJens Axboe 	int queued;
732320ae51fSJens Axboe 
733fd1270d5SJens Axboe 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
734e4043dcfSJens Axboe 
7355d12f905SJens Axboe 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
736320ae51fSJens Axboe 		return;
737320ae51fSJens Axboe 
738320ae51fSJens Axboe 	hctx->run++;
739320ae51fSJens Axboe 
740320ae51fSJens Axboe 	/*
741320ae51fSJens Axboe 	 * Touch any software queue that has pending entries.
742320ae51fSJens Axboe 	 */
7431429d7c9SJens Axboe 	flush_busy_ctxs(hctx, &rq_list);
744320ae51fSJens Axboe 
745320ae51fSJens Axboe 	/*
746320ae51fSJens Axboe 	 * If we have previous entries on our dispatch list, grab them
747320ae51fSJens Axboe 	 * and stuff them at the front for more fair dispatch.
748320ae51fSJens Axboe 	 */
749320ae51fSJens Axboe 	if (!list_empty_careful(&hctx->dispatch)) {
750320ae51fSJens Axboe 		spin_lock(&hctx->lock);
751320ae51fSJens Axboe 		if (!list_empty(&hctx->dispatch))
752320ae51fSJens Axboe 			list_splice_init(&hctx->dispatch, &rq_list);
753320ae51fSJens Axboe 		spin_unlock(&hctx->lock);
754320ae51fSJens Axboe 	}
755320ae51fSJens Axboe 
756320ae51fSJens Axboe 	/*
75774c45052SJens Axboe 	 * Start off with dptr being NULL, so we start the first request
75874c45052SJens Axboe 	 * immediately, even if we have more pending.
75974c45052SJens Axboe 	 */
76074c45052SJens Axboe 	dptr = NULL;
76174c45052SJens Axboe 
76274c45052SJens Axboe 	/*
763320ae51fSJens Axboe 	 * Now process all the entries, sending them to the driver.
764320ae51fSJens Axboe 	 */
7651429d7c9SJens Axboe 	queued = 0;
766320ae51fSJens Axboe 	while (!list_empty(&rq_list)) {
76774c45052SJens Axboe 		struct blk_mq_queue_data bd;
768320ae51fSJens Axboe 		int ret;
769320ae51fSJens Axboe 
770320ae51fSJens Axboe 		rq = list_first_entry(&rq_list, struct request, queuelist);
771320ae51fSJens Axboe 		list_del_init(&rq->queuelist);
772320ae51fSJens Axboe 
77374c45052SJens Axboe 		bd.rq = rq;
77474c45052SJens Axboe 		bd.list = dptr;
77574c45052SJens Axboe 		bd.last = list_empty(&rq_list);
77674c45052SJens Axboe 
77774c45052SJens Axboe 		ret = q->mq_ops->queue_rq(hctx, &bd);
778320ae51fSJens Axboe 		switch (ret) {
779320ae51fSJens Axboe 		case BLK_MQ_RQ_QUEUE_OK:
780320ae51fSJens Axboe 			queued++;
781320ae51fSJens Axboe 			continue;
782320ae51fSJens Axboe 		case BLK_MQ_RQ_QUEUE_BUSY:
783320ae51fSJens Axboe 			list_add(&rq->queuelist, &rq_list);
784ed0791b2SChristoph Hellwig 			__blk_mq_requeue_request(rq);
785320ae51fSJens Axboe 			break;
786320ae51fSJens Axboe 		default:
787320ae51fSJens Axboe 			pr_err("blk-mq: bad return on queue: %d\n", ret);
788320ae51fSJens Axboe 		case BLK_MQ_RQ_QUEUE_ERROR:
7891e93b8c2SChristoph Hellwig 			rq->errors = -EIO;
790c8a446adSChristoph Hellwig 			blk_mq_end_request(rq, rq->errors);
791320ae51fSJens Axboe 			break;
792320ae51fSJens Axboe 		}
793320ae51fSJens Axboe 
794320ae51fSJens Axboe 		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
795320ae51fSJens Axboe 			break;
79674c45052SJens Axboe 
79774c45052SJens Axboe 		/*
79874c45052SJens Axboe 		 * We've done the first request. If we have more than 1
79974c45052SJens Axboe 		 * left in the list, set dptr to defer issue.
80074c45052SJens Axboe 		 */
80174c45052SJens Axboe 		if (!dptr && rq_list.next != rq_list.prev)
80274c45052SJens Axboe 			dptr = &driver_list;
803320ae51fSJens Axboe 	}
804320ae51fSJens Axboe 
805320ae51fSJens Axboe 	if (!queued)
806320ae51fSJens Axboe 		hctx->dispatched[0]++;
807320ae51fSJens Axboe 	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
808320ae51fSJens Axboe 		hctx->dispatched[ilog2(queued) + 1]++;
809320ae51fSJens Axboe 
810320ae51fSJens Axboe 	/*
811320ae51fSJens Axboe 	 * Any items that need requeuing? Stuff them into hctx->dispatch,
812320ae51fSJens Axboe 	 * that is where we will continue on next queue run.
813320ae51fSJens Axboe 	 */
814320ae51fSJens Axboe 	if (!list_empty(&rq_list)) {
815320ae51fSJens Axboe 		spin_lock(&hctx->lock);
816320ae51fSJens Axboe 		list_splice(&rq_list, &hctx->dispatch);
817320ae51fSJens Axboe 		spin_unlock(&hctx->lock);
8189ba52e58SShaohua Li 		/*
8199ba52e58SShaohua Li 		 * the queue is expected to be stopped with BLK_MQ_RQ_QUEUE_BUSY, but
8209ba52e58SShaohua Li 		 * it's possible the queue is stopped and restarted again
8219ba52e58SShaohua Li 		 * before this. Queue restart will dispatch requests. And since
8229ba52e58SShaohua Li 		 * requests in rq_list aren't added into hctx->dispatch yet,
8239ba52e58SShaohua Li 		 * the requests in rq_list might get lost.
8249ba52e58SShaohua Li 		 *
8259ba52e58SShaohua Li 		 * blk_mq_run_hw_queue() already checks the STOPPED bit
8269ba52e58SShaohua Li 		 **/
8279ba52e58SShaohua Li 		blk_mq_run_hw_queue(hctx, true);
828320ae51fSJens Axboe 	}
829320ae51fSJens Axboe }
830320ae51fSJens Axboe 
831506e931fSJens Axboe /*
832506e931fSJens Axboe  * It'd be great if the workqueue API had a way to pass
833506e931fSJens Axboe  * in a mask and had some smarts for more clever placement.
834506e931fSJens Axboe  * For now we just round-robin here, switching CPUs after every
835506e931fSJens Axboe  * BLK_MQ_CPU_WORK_BATCH queued items.
836506e931fSJens Axboe  */
837506e931fSJens Axboe static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
838506e931fSJens Axboe {
839b657d7e6SChristoph Hellwig 	if (hctx->queue->nr_hw_queues == 1)
840b657d7e6SChristoph Hellwig 		return WORK_CPU_UNBOUND;
841506e931fSJens Axboe 
842506e931fSJens Axboe 	if (--hctx->next_cpu_batch <= 0) {
843b657d7e6SChristoph Hellwig 		int cpu = hctx->next_cpu, next_cpu;
844506e931fSJens Axboe 
845506e931fSJens Axboe 		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
846506e931fSJens Axboe 		if (next_cpu >= nr_cpu_ids)
847506e931fSJens Axboe 			next_cpu = cpumask_first(hctx->cpumask);
848506e931fSJens Axboe 
849506e931fSJens Axboe 		hctx->next_cpu = next_cpu;
850506e931fSJens Axboe 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
851506e931fSJens Axboe 
852506e931fSJens Axboe 		return cpu;
853506e931fSJens Axboe 	}
854506e931fSJens Axboe 
855b657d7e6SChristoph Hellwig 	return hctx->next_cpu;
856b657d7e6SChristoph Hellwig }
857b657d7e6SChristoph Hellwig 
858320ae51fSJens Axboe void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
859320ae51fSJens Axboe {
86019c66e59SMing Lei 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
86119c66e59SMing Lei 	    !blk_mq_hw_queue_mapped(hctx)))
862320ae51fSJens Axboe 		return;
863320ae51fSJens Axboe 
864398205b8SPaolo Bonzini 	if (!async) {
8652a90d4aaSPaolo Bonzini 		int cpu = get_cpu();
8662a90d4aaSPaolo Bonzini 		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
867320ae51fSJens Axboe 			__blk_mq_run_hw_queue(hctx);
8682a90d4aaSPaolo Bonzini 			put_cpu();
869398205b8SPaolo Bonzini 			return;
870e4043dcfSJens Axboe 		}
871398205b8SPaolo Bonzini 
8722a90d4aaSPaolo Bonzini 		put_cpu();
873398205b8SPaolo Bonzini 	}
874398205b8SPaolo Bonzini 
875b657d7e6SChristoph Hellwig 	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
876b657d7e6SChristoph Hellwig 			&hctx->run_work, 0);
877320ae51fSJens Axboe }
878320ae51fSJens Axboe 
879b94ec296SMike Snitzer void blk_mq_run_hw_queues(struct request_queue *q, bool async)
880320ae51fSJens Axboe {
881320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
882320ae51fSJens Axboe 	int i;
883320ae51fSJens Axboe 
884320ae51fSJens Axboe 	queue_for_each_hw_ctx(q, hctx, i) {
885320ae51fSJens Axboe 		if ((!blk_mq_hctx_has_pending(hctx) &&
886320ae51fSJens Axboe 		    list_empty_careful(&hctx->dispatch)) ||
8875d12f905SJens Axboe 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
888320ae51fSJens Axboe 			continue;
889320ae51fSJens Axboe 
890b94ec296SMike Snitzer 		blk_mq_run_hw_queue(hctx, async);
891320ae51fSJens Axboe 	}
892320ae51fSJens Axboe }
893b94ec296SMike Snitzer EXPORT_SYMBOL(blk_mq_run_hw_queues);
894320ae51fSJens Axboe 
895320ae51fSJens Axboe void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
896320ae51fSJens Axboe {
89770f4db63SChristoph Hellwig 	cancel_delayed_work(&hctx->run_work);
89870f4db63SChristoph Hellwig 	cancel_delayed_work(&hctx->delay_work);
899320ae51fSJens Axboe 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
900320ae51fSJens Axboe }
901320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_stop_hw_queue);
902320ae51fSJens Axboe 
903280d45f6SChristoph Hellwig void blk_mq_stop_hw_queues(struct request_queue *q)
904280d45f6SChristoph Hellwig {
905280d45f6SChristoph Hellwig 	struct blk_mq_hw_ctx *hctx;
906280d45f6SChristoph Hellwig 	int i;
907280d45f6SChristoph Hellwig 
908280d45f6SChristoph Hellwig 	queue_for_each_hw_ctx(q, hctx, i)
909280d45f6SChristoph Hellwig 		blk_mq_stop_hw_queue(hctx);
910280d45f6SChristoph Hellwig }
911280d45f6SChristoph Hellwig EXPORT_SYMBOL(blk_mq_stop_hw_queues);
912280d45f6SChristoph Hellwig 
913320ae51fSJens Axboe void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
914320ae51fSJens Axboe {
915320ae51fSJens Axboe 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
916e4043dcfSJens Axboe 
9170ffbce80SJens Axboe 	blk_mq_run_hw_queue(hctx, false);
918320ae51fSJens Axboe }
919320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_start_hw_queue);
920320ae51fSJens Axboe 
9212f268556SChristoph Hellwig void blk_mq_start_hw_queues(struct request_queue *q)
9222f268556SChristoph Hellwig {
9232f268556SChristoph Hellwig 	struct blk_mq_hw_ctx *hctx;
9242f268556SChristoph Hellwig 	int i;
9252f268556SChristoph Hellwig 
9262f268556SChristoph Hellwig 	queue_for_each_hw_ctx(q, hctx, i)
9272f268556SChristoph Hellwig 		blk_mq_start_hw_queue(hctx);
9282f268556SChristoph Hellwig }
9292f268556SChristoph Hellwig EXPORT_SYMBOL(blk_mq_start_hw_queues);
9302f268556SChristoph Hellwig 
9311b4a3258SChristoph Hellwig void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
932320ae51fSJens Axboe {
933320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
934320ae51fSJens Axboe 	int i;
935320ae51fSJens Axboe 
936320ae51fSJens Axboe 	queue_for_each_hw_ctx(q, hctx, i) {
937320ae51fSJens Axboe 		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
938320ae51fSJens Axboe 			continue;
939320ae51fSJens Axboe 
940320ae51fSJens Axboe 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
9411b4a3258SChristoph Hellwig 		blk_mq_run_hw_queue(hctx, async);
942320ae51fSJens Axboe 	}
943320ae51fSJens Axboe }
944320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
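
/*
 * Editorial sketch, not part of blk-mq: the counterpart to stopping a queue
 * on BUSY is restarting stopped queues from the completion path once
 * resources are free again. Hypothetical example.
 */
#if 0	/* illustration only, never compiled */
static void example_complete_and_restart(struct request *rq, int error)
{
	struct request_queue *q = rq->q;

	blk_mq_end_request(rq, error);
	/* a tag/resource was just released; let stopped hw queues run again */
	blk_mq_start_stopped_hw_queues(q, true);
}
#endif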
945320ae51fSJens Axboe 
94670f4db63SChristoph Hellwig static void blk_mq_run_work_fn(struct work_struct *work)
947320ae51fSJens Axboe {
948320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
949320ae51fSJens Axboe 
95070f4db63SChristoph Hellwig 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
951e4043dcfSJens Axboe 
952320ae51fSJens Axboe 	__blk_mq_run_hw_queue(hctx);
953320ae51fSJens Axboe }
954320ae51fSJens Axboe 
95570f4db63SChristoph Hellwig static void blk_mq_delay_work_fn(struct work_struct *work)
95670f4db63SChristoph Hellwig {
95770f4db63SChristoph Hellwig 	struct blk_mq_hw_ctx *hctx;
95870f4db63SChristoph Hellwig 
95970f4db63SChristoph Hellwig 	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
96070f4db63SChristoph Hellwig 
96170f4db63SChristoph Hellwig 	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
96270f4db63SChristoph Hellwig 		__blk_mq_run_hw_queue(hctx);
96370f4db63SChristoph Hellwig }
96470f4db63SChristoph Hellwig 
96570f4db63SChristoph Hellwig void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
96670f4db63SChristoph Hellwig {
96719c66e59SMing Lei 	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
96819c66e59SMing Lei 		return;
96970f4db63SChristoph Hellwig 
970b657d7e6SChristoph Hellwig 	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
971b657d7e6SChristoph Hellwig 			&hctx->delay_work, msecs_to_jiffies(msecs));
97270f4db63SChristoph Hellwig }
97370f4db63SChristoph Hellwig EXPORT_SYMBOL(blk_mq_delay_queue);
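
/*
 * Editorial sketch, not part of blk-mq: when ->queue_rq() runs out of
 * device resources it commonly stops the hardware queue, schedules a
 * delayed re-run, and returns BLK_MQ_RQ_QUEUE_BUSY so the request is
 * retried. example_device_busy() is hypothetical.
 */
#if 0	/* illustration only, never compiled */
static int example_queue_rq(struct blk_mq_hw_ctx *hctx,
			    const struct blk_mq_queue_data *bd)
{
	if (example_device_busy(hctx->driver_data)) {
		blk_mq_stop_hw_queue(hctx);	/* pause dispatch */
		blk_mq_delay_queue(hctx, 3);	/* re-run in ~3ms */
		return BLK_MQ_RQ_QUEUE_BUSY;	/* rq goes back on the dispatch list */
	}

	/* issue bd->rq to the hardware here */
	return BLK_MQ_RQ_QUEUE_OK;
}
#endif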
97470f4db63SChristoph Hellwig 
975320ae51fSJens Axboe static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
97672a0a36eSChristoph Hellwig 				    struct request *rq, bool at_head)
977320ae51fSJens Axboe {
978320ae51fSJens Axboe 	struct blk_mq_ctx *ctx = rq->mq_ctx;
979320ae51fSJens Axboe 
98001b983c9SJens Axboe 	trace_block_rq_insert(hctx->queue, rq);
98101b983c9SJens Axboe 
98272a0a36eSChristoph Hellwig 	if (at_head)
98372a0a36eSChristoph Hellwig 		list_add(&rq->queuelist, &ctx->rq_list);
98472a0a36eSChristoph Hellwig 	else
985320ae51fSJens Axboe 		list_add_tail(&rq->queuelist, &ctx->rq_list);
9864bb659b1SJens Axboe 
987320ae51fSJens Axboe 	blk_mq_hctx_mark_pending(hctx, ctx);
988320ae51fSJens Axboe }
989320ae51fSJens Axboe 
990eeabc850SChristoph Hellwig void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
991eeabc850SChristoph Hellwig 		bool async)
992320ae51fSJens Axboe {
993320ae51fSJens Axboe 	struct request_queue *q = rq->q;
994320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
995eeabc850SChristoph Hellwig 	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
996320ae51fSJens Axboe 
997320ae51fSJens Axboe 	current_ctx = blk_mq_get_ctx(q);
998eeabc850SChristoph Hellwig 	if (!cpu_online(ctx->cpu))
999eeabc850SChristoph Hellwig 		rq->mq_ctx = ctx = current_ctx;
1000320ae51fSJens Axboe 
1001320ae51fSJens Axboe 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
1002320ae51fSJens Axboe 
1003320ae51fSJens Axboe 	spin_lock(&ctx->lock);
1004eeabc850SChristoph Hellwig 	__blk_mq_insert_request(hctx, rq, at_head);
1005320ae51fSJens Axboe 	spin_unlock(&ctx->lock);
1006320ae51fSJens Axboe 
1007320ae51fSJens Axboe 	if (run_queue)
1008320ae51fSJens Axboe 		blk_mq_run_hw_queue(hctx, async);
1009e4043dcfSJens Axboe 
1010e4043dcfSJens Axboe 	blk_mq_put_ctx(current_ctx);
1011320ae51fSJens Axboe }
1012320ae51fSJens Axboe 
1013320ae51fSJens Axboe static void blk_mq_insert_requests(struct request_queue *q,
1014320ae51fSJens Axboe 				     struct blk_mq_ctx *ctx,
1015320ae51fSJens Axboe 				     struct list_head *list,
1016320ae51fSJens Axboe 				     int depth,
1017320ae51fSJens Axboe 				     bool from_schedule)
1018320ae51fSJens Axboe 
1019320ae51fSJens Axboe {
1020320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
1021320ae51fSJens Axboe 	struct blk_mq_ctx *current_ctx;
1022320ae51fSJens Axboe 
1023320ae51fSJens Axboe 	trace_block_unplug(q, depth, !from_schedule);
1024320ae51fSJens Axboe 
1025320ae51fSJens Axboe 	current_ctx = blk_mq_get_ctx(q);
1026320ae51fSJens Axboe 
1027320ae51fSJens Axboe 	if (!cpu_online(ctx->cpu))
1028320ae51fSJens Axboe 		ctx = current_ctx;
1029320ae51fSJens Axboe 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
1030320ae51fSJens Axboe 
1031320ae51fSJens Axboe 	/*
1032320ae51fSJens Axboe 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
1033320ae51fSJens Axboe 	 * offline now
1034320ae51fSJens Axboe 	 */
1035320ae51fSJens Axboe 	spin_lock(&ctx->lock);
1036320ae51fSJens Axboe 	while (!list_empty(list)) {
1037320ae51fSJens Axboe 		struct request *rq;
1038320ae51fSJens Axboe 
1039320ae51fSJens Axboe 		rq = list_first_entry(list, struct request, queuelist);
1040320ae51fSJens Axboe 		list_del_init(&rq->queuelist);
1041320ae51fSJens Axboe 		rq->mq_ctx = ctx;
104272a0a36eSChristoph Hellwig 		__blk_mq_insert_request(hctx, rq, false);
1043320ae51fSJens Axboe 	}
1044320ae51fSJens Axboe 	spin_unlock(&ctx->lock);
1045320ae51fSJens Axboe 
1046320ae51fSJens Axboe 	blk_mq_run_hw_queue(hctx, from_schedule);
1047e4043dcfSJens Axboe 	blk_mq_put_ctx(current_ctx);
1048320ae51fSJens Axboe }
1049320ae51fSJens Axboe 
1050320ae51fSJens Axboe static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1051320ae51fSJens Axboe {
1052320ae51fSJens Axboe 	struct request *rqa = container_of(a, struct request, queuelist);
1053320ae51fSJens Axboe 	struct request *rqb = container_of(b, struct request, queuelist);
1054320ae51fSJens Axboe 
1055320ae51fSJens Axboe 	return !(rqa->mq_ctx < rqb->mq_ctx ||
1056320ae51fSJens Axboe 		 (rqa->mq_ctx == rqb->mq_ctx &&
1057320ae51fSJens Axboe 		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1058320ae51fSJens Axboe }
1059320ae51fSJens Axboe 
1060320ae51fSJens Axboe void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1061320ae51fSJens Axboe {
1062320ae51fSJens Axboe 	struct blk_mq_ctx *this_ctx;
1063320ae51fSJens Axboe 	struct request_queue *this_q;
1064320ae51fSJens Axboe 	struct request *rq;
1065320ae51fSJens Axboe 	LIST_HEAD(list);
1066320ae51fSJens Axboe 	LIST_HEAD(ctx_list);
1067320ae51fSJens Axboe 	unsigned int depth;
1068320ae51fSJens Axboe 
1069320ae51fSJens Axboe 	list_splice_init(&plug->mq_list, &list);
1070320ae51fSJens Axboe 
1071320ae51fSJens Axboe 	list_sort(NULL, &list, plug_ctx_cmp);
1072320ae51fSJens Axboe 
1073320ae51fSJens Axboe 	this_q = NULL;
1074320ae51fSJens Axboe 	this_ctx = NULL;
1075320ae51fSJens Axboe 	depth = 0;
1076320ae51fSJens Axboe 
1077320ae51fSJens Axboe 	while (!list_empty(&list)) {
1078320ae51fSJens Axboe 		rq = list_entry_rq(list.next);
1079320ae51fSJens Axboe 		list_del_init(&rq->queuelist);
1080320ae51fSJens Axboe 		BUG_ON(!rq->q);
1081320ae51fSJens Axboe 		if (rq->mq_ctx != this_ctx) {
1082320ae51fSJens Axboe 			if (this_ctx) {
1083320ae51fSJens Axboe 				blk_mq_insert_requests(this_q, this_ctx,
1084320ae51fSJens Axboe 							&ctx_list, depth,
1085320ae51fSJens Axboe 							from_schedule);
1086320ae51fSJens Axboe 			}
1087320ae51fSJens Axboe 
1088320ae51fSJens Axboe 			this_ctx = rq->mq_ctx;
1089320ae51fSJens Axboe 			this_q = rq->q;
1090320ae51fSJens Axboe 			depth = 0;
1091320ae51fSJens Axboe 		}
1092320ae51fSJens Axboe 
1093320ae51fSJens Axboe 		depth++;
1094320ae51fSJens Axboe 		list_add_tail(&rq->queuelist, &ctx_list);
1095320ae51fSJens Axboe 	}
1096320ae51fSJens Axboe 
1097320ae51fSJens Axboe 	/*
1098320ae51fSJens Axboe 	 * If 'this_ctx' is set, we know we have entries to complete
1099320ae51fSJens Axboe 	 * on 'ctx_list'. Do those.
1100320ae51fSJens Axboe 	 */
1101320ae51fSJens Axboe 	if (this_ctx) {
1102320ae51fSJens Axboe 		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1103320ae51fSJens Axboe 				       from_schedule);
1104320ae51fSJens Axboe 	}
1105320ae51fSJens Axboe }
1106320ae51fSJens Axboe 
1107320ae51fSJens Axboe static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1108320ae51fSJens Axboe {
1109320ae51fSJens Axboe 	init_request_from_bio(rq, bio);
11104b570521SJens Axboe 
11113ee32372SJens Axboe 	if (blk_do_io_stat(rq))
1112320ae51fSJens Axboe 		blk_account_io_start(rq, 1);
1113320ae51fSJens Axboe }
1114320ae51fSJens Axboe 
1115274a5843SJens Axboe static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1116274a5843SJens Axboe {
1117274a5843SJens Axboe 	return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1118274a5843SJens Axboe 		!blk_queue_nomerges(hctx->queue);
1119274a5843SJens Axboe }
1120274a5843SJens Axboe 
112107068d5bSJens Axboe static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
112207068d5bSJens Axboe 					 struct blk_mq_ctx *ctx,
112307068d5bSJens Axboe 					 struct request *rq, struct bio *bio)
112407068d5bSJens Axboe {
1125274a5843SJens Axboe 	if (!hctx_allow_merges(hctx)) {
112607068d5bSJens Axboe 		blk_mq_bio_to_request(rq, bio);
112707068d5bSJens Axboe 		spin_lock(&ctx->lock);
112807068d5bSJens Axboe insert_rq:
112907068d5bSJens Axboe 		__blk_mq_insert_request(hctx, rq, false);
113007068d5bSJens Axboe 		spin_unlock(&ctx->lock);
113107068d5bSJens Axboe 		return false;
113207068d5bSJens Axboe 	} else {
1133274a5843SJens Axboe 		struct request_queue *q = hctx->queue;
1134274a5843SJens Axboe 
113507068d5bSJens Axboe 		spin_lock(&ctx->lock);
113607068d5bSJens Axboe 		if (!blk_mq_attempt_merge(q, ctx, bio)) {
113707068d5bSJens Axboe 			blk_mq_bio_to_request(rq, bio);
113807068d5bSJens Axboe 			goto insert_rq;
113907068d5bSJens Axboe 		}
114007068d5bSJens Axboe 
114107068d5bSJens Axboe 		spin_unlock(&ctx->lock);
114207068d5bSJens Axboe 		__blk_mq_free_request(hctx, ctx, rq);
114307068d5bSJens Axboe 		return true;
114407068d5bSJens Axboe 	}
114507068d5bSJens Axboe }
114607068d5bSJens Axboe 
114707068d5bSJens Axboe struct blk_map_ctx {
114807068d5bSJens Axboe 	struct blk_mq_hw_ctx *hctx;
114907068d5bSJens Axboe 	struct blk_mq_ctx *ctx;
115007068d5bSJens Axboe };
115107068d5bSJens Axboe 
115207068d5bSJens Axboe static struct request *blk_mq_map_request(struct request_queue *q,
115307068d5bSJens Axboe 					  struct bio *bio,
115407068d5bSJens Axboe 					  struct blk_map_ctx *data)
1155320ae51fSJens Axboe {
1156320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
1157320ae51fSJens Axboe 	struct blk_mq_ctx *ctx;
1158320ae51fSJens Axboe 	struct request *rq;
115907068d5bSJens Axboe 	int rw = bio_data_dir(bio);
1160cb96a42cSMing Lei 	struct blk_mq_alloc_data alloc_data;
1161320ae51fSJens Axboe 
11623ef28e83SDan Williams 	blk_queue_enter_live(q);
1163320ae51fSJens Axboe 	ctx = blk_mq_get_ctx(q);
1164320ae51fSJens Axboe 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
1165320ae51fSJens Axboe 
116607068d5bSJens Axboe 	if (rw_is_sync(bio->bi_rw))
116727fbf4e8SShaohua Li 		rw |= REQ_SYNC;
116807068d5bSJens Axboe 
1169320ae51fSJens Axboe 	trace_block_getrq(q, bio, rw);
1170cb96a42cSMing Lei 	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
1171cb96a42cSMing Lei 			hctx);
1172cb96a42cSMing Lei 	rq = __blk_mq_alloc_request(&alloc_data, rw);
11735dee8577SChristoph Hellwig 	if (unlikely(!rq)) {
1174793597a6SChristoph Hellwig 		__blk_mq_run_hw_queue(hctx);
1175320ae51fSJens Axboe 		blk_mq_put_ctx(ctx);
1176320ae51fSJens Axboe 		trace_block_sleeprq(q, bio, rw);
1177793597a6SChristoph Hellwig 
1178793597a6SChristoph Hellwig 		ctx = blk_mq_get_ctx(q);
1179320ae51fSJens Axboe 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
1180cb96a42cSMing Lei 		blk_mq_set_alloc_data(&alloc_data, q,
1181cb96a42cSMing Lei 				__GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
1182cb96a42cSMing Lei 		rq = __blk_mq_alloc_request(&alloc_data, rw);
1183cb96a42cSMing Lei 		ctx = alloc_data.ctx;
1184cb96a42cSMing Lei 		hctx = alloc_data.hctx;
1185320ae51fSJens Axboe 	}
1186320ae51fSJens Axboe 
1187320ae51fSJens Axboe 	hctx->queued++;
118807068d5bSJens Axboe 	data->hctx = hctx;
118907068d5bSJens Axboe 	data->ctx = ctx;
119007068d5bSJens Axboe 	return rq;
119107068d5bSJens Axboe }
119207068d5bSJens Axboe 
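/*
 * Issue a request directly to the driver, bypassing the software queues.
 * Returns 0 if the driver accepted it or it was completed with an error,
 * and -1 if the driver was busy, in which case the caller falls back to
 * inserting the request normally.
 */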
1193f984df1fSShaohua Li static int blk_mq_direct_issue_request(struct request *rq)
1194f984df1fSShaohua Li {
1195f984df1fSShaohua Li 	int ret;
1196f984df1fSShaohua Li 	struct request_queue *q = rq->q;
1197f984df1fSShaohua Li 	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
1198f984df1fSShaohua Li 			rq->mq_ctx->cpu);
1199f984df1fSShaohua Li 	struct blk_mq_queue_data bd = {
1200f984df1fSShaohua Li 		.rq = rq,
1201f984df1fSShaohua Li 		.list = NULL,
1202f984df1fSShaohua Li 		.last = 1
1203f984df1fSShaohua Li 	};
1204f984df1fSShaohua Li 
1205f984df1fSShaohua Li 	/*
1206f984df1fSShaohua Li 	 * If the driver queued the request OK, we are done. On a hard
1207f984df1fSShaohua Li 	 * error, kill it. For any other return (busy), just add it to
1208f984df1fSShaohua Li 	 * our list as we previously would have done.
1209f984df1fSShaohua Li 	 */
1210f984df1fSShaohua Li 	ret = q->mq_ops->queue_rq(hctx, &bd);
1211f984df1fSShaohua Li 	if (ret == BLK_MQ_RQ_QUEUE_OK)
1212f984df1fSShaohua Li 		return 0;
1213f984df1fSShaohua Li 	else {
1214f984df1fSShaohua Li 		__blk_mq_requeue_request(rq);
1215f984df1fSShaohua Li 
1216f984df1fSShaohua Li 		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1217f984df1fSShaohua Li 			rq->errors = -EIO;
1218f984df1fSShaohua Li 			blk_mq_end_request(rq, rq->errors);
1219f984df1fSShaohua Li 			return 0;
1220f984df1fSShaohua Li 		}
1221f984df1fSShaohua Li 		return -1;
1222f984df1fSShaohua Li 	}
1223f984df1fSShaohua Li }
1224f984df1fSShaohua Li 
122507068d5bSJens Axboe /*
122607068d5bSJens Axboe  * Multiple hardware queue variant. This only does limited per-process
122707068d5bSJens Axboe  * plugging, and will attempt to bypass the hctx queueing if we can go
122807068d5bSJens Axboe  * straight to hardware for SYNC IO.
122907068d5bSJens Axboe  */
123007068d5bSJens Axboe static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
123107068d5bSJens Axboe {
123207068d5bSJens Axboe 	const int is_sync = rw_is_sync(bio->bi_rw);
123307068d5bSJens Axboe 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
123407068d5bSJens Axboe 	struct blk_map_ctx data;
123507068d5bSJens Axboe 	struct request *rq;
1236f984df1fSShaohua Li 	unsigned int request_count = 0;
1237f984df1fSShaohua Li 	struct blk_plug *plug;
12385b3f341fSShaohua Li 	struct request *same_queue_rq = NULL;
123907068d5bSJens Axboe 
124007068d5bSJens Axboe 	blk_queue_bounce(q, &bio);
124107068d5bSJens Axboe 
124207068d5bSJens Axboe 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
12434246a0b6SChristoph Hellwig 		bio_io_error(bio);
124407068d5bSJens Axboe 		return;
124507068d5bSJens Axboe 	}
124607068d5bSJens Axboe 
124754efd50bSKent Overstreet 	blk_queue_split(q, &bio, q->bio_split);
124854efd50bSKent Overstreet 
1249f984df1fSShaohua Li 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
12505b3f341fSShaohua Li 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1251f984df1fSShaohua Li 		return;
1252f984df1fSShaohua Li 
125307068d5bSJens Axboe 	rq = blk_mq_map_request(q, bio, &data);
125407068d5bSJens Axboe 	if (unlikely(!rq))
125507068d5bSJens Axboe 		return;
125607068d5bSJens Axboe 
125707068d5bSJens Axboe 	if (unlikely(is_flush_fua)) {
125807068d5bSJens Axboe 		blk_mq_bio_to_request(rq, bio);
125907068d5bSJens Axboe 		blk_insert_flush(rq);
126007068d5bSJens Axboe 		goto run_queue;
126107068d5bSJens Axboe 	}
126207068d5bSJens Axboe 
1263f984df1fSShaohua Li 	plug = current->plug;
1264e167dfb5SJens Axboe 	/*
1265e167dfb5SJens Axboe 	 * If the driver supports deferred issue based on 'last', then
1266e167dfb5SJens Axboe 	 * queue it up like normal since we can potentially save some
1267e167dfb5SJens Axboe 	 * CPU this way.
1268e167dfb5SJens Axboe 	 */
1269f984df1fSShaohua Li 	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1270f984df1fSShaohua Li 	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1271f984df1fSShaohua Li 		struct request *old_rq = NULL;
127207068d5bSJens Axboe 
127307068d5bSJens Axboe 		blk_mq_bio_to_request(rq, bio);
127407068d5bSJens Axboe 
127507068d5bSJens Axboe 		/*
1276f984df1fSShaohua Li 		 * We do limited plugging. If the bio can be merged, do that.
1277f984df1fSShaohua Li 		 * Otherwise the existing request in the plug list will be
1278f984df1fSShaohua Li 		 * issued. So the plug list will have at most one request.
127907068d5bSJens Axboe 		 */
1280f984df1fSShaohua Li 		if (plug) {
12815b3f341fSShaohua Li 			/*
12825b3f341fSShaohua Li 			 * The plug list might get flushed before this. If that
12835b3f341fSShaohua Li 			 * happens, same_queue_rq is invalid and the plug list is empty.
12845b3f341fSShaohua Li 			 */
12855b3f341fSShaohua Li 			if (same_queue_rq && !list_empty(&plug->mq_list)) {
12865b3f341fSShaohua Li 				old_rq = same_queue_rq;
1287f984df1fSShaohua Li 				list_del_init(&old_rq->queuelist);
128807068d5bSJens Axboe 			}
1289f984df1fSShaohua Li 			list_add_tail(&rq->queuelist, &plug->mq_list);
1290f984df1fSShaohua Li 		} else /* is_sync */
1291f984df1fSShaohua Li 			old_rq = rq;
1292f984df1fSShaohua Li 		blk_mq_put_ctx(data.ctx);
1293f984df1fSShaohua Li 		if (!old_rq)
1294239ad215SShaohua Li 			return;
1295f984df1fSShaohua Li 		if (!blk_mq_direct_issue_request(old_rq))
1296f984df1fSShaohua Li 			return;
1297f984df1fSShaohua Li 		blk_mq_insert_request(old_rq, false, true, true);
1298f984df1fSShaohua Li 		return;
129907068d5bSJens Axboe 	}
130007068d5bSJens Axboe 
130107068d5bSJens Axboe 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
130207068d5bSJens Axboe 		/*
130307068d5bSJens Axboe 		 * For a SYNC request, send it to the hardware immediately. For
130407068d5bSJens Axboe 		 * an ASYNC request, just ensure that we run it later on. The
130507068d5bSJens Axboe 		 * latter allows for merging opportunities and more efficient
130607068d5bSJens Axboe 		 * dispatching.
130707068d5bSJens Axboe 		 */
130807068d5bSJens Axboe run_queue:
130907068d5bSJens Axboe 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
131007068d5bSJens Axboe 	}
131107068d5bSJens Axboe 	blk_mq_put_ctx(data.ctx);
131207068d5bSJens Axboe }
131307068d5bSJens Axboe 
131407068d5bSJens Axboe /*
131507068d5bSJens Axboe  * Single hardware queue variant. This will attempt to use any per-process
131607068d5bSJens Axboe  * plug for merging and IO deferral.
131707068d5bSJens Axboe  */
131807068d5bSJens Axboe static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
131907068d5bSJens Axboe {
132007068d5bSJens Axboe 	const int is_sync = rw_is_sync(bio->bi_rw);
132107068d5bSJens Axboe 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1322e6c4438bSJeff Moyer 	struct blk_plug *plug;
1323e6c4438bSJeff Moyer 	unsigned int request_count = 0;
132407068d5bSJens Axboe 	struct blk_map_ctx data;
132507068d5bSJens Axboe 	struct request *rq;
132607068d5bSJens Axboe 
132707068d5bSJens Axboe 	blk_queue_bounce(q, &bio);
132807068d5bSJens Axboe 
132907068d5bSJens Axboe 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
13304246a0b6SChristoph Hellwig 		bio_io_error(bio);
133107068d5bSJens Axboe 		return;
133207068d5bSJens Axboe 	}
133307068d5bSJens Axboe 
133454efd50bSKent Overstreet 	blk_queue_split(q, &bio, q->bio_split);
133554efd50bSKent Overstreet 
1336e6c4438bSJeff Moyer 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
13375b3f341fSShaohua Li 	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
133807068d5bSJens Axboe 		return;
133907068d5bSJens Axboe 
134007068d5bSJens Axboe 	rq = blk_mq_map_request(q, bio, &data);
1341ff87bcecSJens Axboe 	if (unlikely(!rq))
1342ff87bcecSJens Axboe 		return;
1343320ae51fSJens Axboe 
1344320ae51fSJens Axboe 	if (unlikely(is_flush_fua)) {
1345320ae51fSJens Axboe 		blk_mq_bio_to_request(rq, bio);
1346320ae51fSJens Axboe 		blk_insert_flush(rq);
1347320ae51fSJens Axboe 		goto run_queue;
1348320ae51fSJens Axboe 	}
1349320ae51fSJens Axboe 
1350320ae51fSJens Axboe 	/*
1351320ae51fSJens Axboe 	 * If a task plug exists, use it: since it is completely lockless,
1352320ae51fSJens Axboe 	 * utilize it to temporarily store requests until the task is
1353320ae51fSJens Axboe 	 * either done or scheduled away.
1354320ae51fSJens Axboe 	 */
1355e6c4438bSJeff Moyer 	plug = current->plug;
1356320ae51fSJens Axboe 	if (plug) {
1357320ae51fSJens Axboe 		blk_mq_bio_to_request(rq, bio);
135892f399c7SShaohua Li 		if (list_empty(&plug->mq_list))
1359320ae51fSJens Axboe 			trace_block_plug(q);
1360320ae51fSJens Axboe 		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1361320ae51fSJens Axboe 			blk_flush_plug_list(plug, false);
1362320ae51fSJens Axboe 			trace_block_plug(q);
1363320ae51fSJens Axboe 		}
1364320ae51fSJens Axboe 		list_add_tail(&rq->queuelist, &plug->mq_list);
136507068d5bSJens Axboe 		blk_mq_put_ctx(data.ctx);
1366320ae51fSJens Axboe 		return;
1367320ae51fSJens Axboe 	}
1368320ae51fSJens Axboe 
136907068d5bSJens Axboe 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1370320ae51fSJens Axboe 		/*
137107068d5bSJens Axboe 		 * For a SYNC request, send it to the hardware immediately. For
137207068d5bSJens Axboe 		 * an ASYNC request, just ensure that we run it later on. The
137307068d5bSJens Axboe 		 * latter allows for merging opportunities and more efficient
137407068d5bSJens Axboe 		 * dispatching.
1375320ae51fSJens Axboe 		 */
1376320ae51fSJens Axboe run_queue:
137707068d5bSJens Axboe 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
137807068d5bSJens Axboe 	}
137907068d5bSJens Axboe 
138007068d5bSJens Axboe 	blk_mq_put_ctx(data.ctx);
1381320ae51fSJens Axboe }
1382320ae51fSJens Axboe 
1383320ae51fSJens Axboe /*
1384320ae51fSJens Axboe  * Default mapping to a software queue, since we use one per CPU.
1385320ae51fSJens Axboe  */
1386320ae51fSJens Axboe struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1387320ae51fSJens Axboe {
1388320ae51fSJens Axboe 	return q->queue_hw_ctx[q->mq_map[cpu]];
1389320ae51fSJens Axboe }
1390320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_map_queue);
1391320ae51fSJens Axboe 
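/*
 * Release the requests backing a tag map: call the driver's
 * ->exit_request() hook for each allocated request, free the backing
 * pages, and finally free the tag structure itself.
 */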
139224d2f903SChristoph Hellwig static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
139324d2f903SChristoph Hellwig 		struct blk_mq_tags *tags, unsigned int hctx_idx)
1394320ae51fSJens Axboe {
1395320ae51fSJens Axboe 	struct page *page;
1396320ae51fSJens Axboe 
139724d2f903SChristoph Hellwig 	if (tags->rqs && set->ops->exit_request) {
1398e9b267d9SChristoph Hellwig 		int i;
1399e9b267d9SChristoph Hellwig 
140024d2f903SChristoph Hellwig 		for (i = 0; i < tags->nr_tags; i++) {
140124d2f903SChristoph Hellwig 			if (!tags->rqs[i])
1402e9b267d9SChristoph Hellwig 				continue;
140324d2f903SChristoph Hellwig 			set->ops->exit_request(set->driver_data, tags->rqs[i],
140424d2f903SChristoph Hellwig 						hctx_idx, i);
1405a5164405SJens Axboe 			tags->rqs[i] = NULL;
1406e9b267d9SChristoph Hellwig 		}
1407e9b267d9SChristoph Hellwig 	}
1408e9b267d9SChristoph Hellwig 
140924d2f903SChristoph Hellwig 	while (!list_empty(&tags->page_list)) {
141024d2f903SChristoph Hellwig 		page = list_first_entry(&tags->page_list, struct page, lru);
14116753471cSDave Hansen 		list_del_init(&page->lru);
1412f75782e4SCatalin Marinas 		/*
1413f75782e4SCatalin Marinas 		 * Remove kmemleak object previously allocated in
1414f75782e4SCatalin Marinas 		 * blk_mq_init_rq_map().
1415f75782e4SCatalin Marinas 		 */
1416f75782e4SCatalin Marinas 		kmemleak_free(page_address(page));
1417320ae51fSJens Axboe 		__free_pages(page, page->private);
1418320ae51fSJens Axboe 	}
1419320ae51fSJens Axboe 
142024d2f903SChristoph Hellwig 	kfree(tags->rqs);
1421320ae51fSJens Axboe 
142224d2f903SChristoph Hellwig 	blk_mq_free_tags(tags);
1423320ae51fSJens Axboe }
1424320ae51fSJens Axboe 
1425320ae51fSJens Axboe static size_t order_to_size(unsigned int order)
1426320ae51fSJens Axboe {
14274ca08500SMing Lei 	return (size_t)PAGE_SIZE << order;
1428320ae51fSJens Axboe }
1429320ae51fSJens Axboe 
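/*
 * Allocate the tag map and the requests backing it for one hardware
 * queue. Requests are carved out of higher-order pages, starting at
 * order 4 and falling back to smaller orders under memory pressure, and
 * the driver's ->init_request() hook is called for each request.
 */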
143024d2f903SChristoph Hellwig static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
143124d2f903SChristoph Hellwig 		unsigned int hctx_idx)
1432320ae51fSJens Axboe {
143324d2f903SChristoph Hellwig 	struct blk_mq_tags *tags;
1434320ae51fSJens Axboe 	unsigned int i, j, entries_per_page, max_order = 4;
1435320ae51fSJens Axboe 	size_t rq_size, left;
1436320ae51fSJens Axboe 
143724d2f903SChristoph Hellwig 	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
143824391c0dSShaohua Li 				set->numa_node,
143924391c0dSShaohua Li 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
144024d2f903SChristoph Hellwig 	if (!tags)
144124d2f903SChristoph Hellwig 		return NULL;
1442320ae51fSJens Axboe 
144324d2f903SChristoph Hellwig 	INIT_LIST_HEAD(&tags->page_list);
144424d2f903SChristoph Hellwig 
1445a5164405SJens Axboe 	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1446a5164405SJens Axboe 				 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1447a5164405SJens Axboe 				 set->numa_node);
144824d2f903SChristoph Hellwig 	if (!tags->rqs) {
144924d2f903SChristoph Hellwig 		blk_mq_free_tags(tags);
145024d2f903SChristoph Hellwig 		return NULL;
145124d2f903SChristoph Hellwig 	}
1452320ae51fSJens Axboe 
1453320ae51fSJens Axboe 	/*
1454320ae51fSJens Axboe 	 * rq_size is the size of the request plus driver payload, rounded
1455320ae51fSJens Axboe 	 * to the cacheline size
1456320ae51fSJens Axboe 	 */
145724d2f903SChristoph Hellwig 	rq_size = round_up(sizeof(struct request) + set->cmd_size,
1458320ae51fSJens Axboe 				cache_line_size());
145924d2f903SChristoph Hellwig 	left = rq_size * set->queue_depth;
1460320ae51fSJens Axboe 
146124d2f903SChristoph Hellwig 	for (i = 0; i < set->queue_depth; ) {
1462320ae51fSJens Axboe 		int this_order = max_order;
1463320ae51fSJens Axboe 		struct page *page;
1464320ae51fSJens Axboe 		int to_do;
1465320ae51fSJens Axboe 		void *p;
1466320ae51fSJens Axboe 
1467320ae51fSJens Axboe 		while (left < order_to_size(this_order - 1) && this_order)
1468320ae51fSJens Axboe 			this_order--;
1469320ae51fSJens Axboe 
1470320ae51fSJens Axboe 		do {
1471a5164405SJens Axboe 			page = alloc_pages_node(set->numa_node,
1472ac211175SLinus Torvalds 				GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
147324d2f903SChristoph Hellwig 				this_order);
1474320ae51fSJens Axboe 			if (page)
1475320ae51fSJens Axboe 				break;
1476320ae51fSJens Axboe 			if (!this_order--)
1477320ae51fSJens Axboe 				break;
1478320ae51fSJens Axboe 			if (order_to_size(this_order) < rq_size)
1479320ae51fSJens Axboe 				break;
1480320ae51fSJens Axboe 		} while (1);
1481320ae51fSJens Axboe 
1482320ae51fSJens Axboe 		if (!page)
148324d2f903SChristoph Hellwig 			goto fail;
1484320ae51fSJens Axboe 
1485320ae51fSJens Axboe 		page->private = this_order;
148624d2f903SChristoph Hellwig 		list_add_tail(&page->lru, &tags->page_list);
1487320ae51fSJens Axboe 
1488320ae51fSJens Axboe 		p = page_address(page);
1489f75782e4SCatalin Marinas 		/*
1490f75782e4SCatalin Marinas 		 * Allow kmemleak to scan these pages as they contain pointers
1491f75782e4SCatalin Marinas 		 * to additional allocations like via ops->init_request().
1492f75782e4SCatalin Marinas 		 */
1493f75782e4SCatalin Marinas 		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
1494320ae51fSJens Axboe 		entries_per_page = order_to_size(this_order) / rq_size;
149524d2f903SChristoph Hellwig 		to_do = min(entries_per_page, set->queue_depth - i);
1496320ae51fSJens Axboe 		left -= to_do * rq_size;
1497320ae51fSJens Axboe 		for (j = 0; j < to_do; j++) {
149824d2f903SChristoph Hellwig 			tags->rqs[i] = p;
149924d2f903SChristoph Hellwig 			if (set->ops->init_request) {
150024d2f903SChristoph Hellwig 				if (set->ops->init_request(set->driver_data,
150124d2f903SChristoph Hellwig 						tags->rqs[i], hctx_idx, i,
1502a5164405SJens Axboe 						set->numa_node)) {
1503a5164405SJens Axboe 					tags->rqs[i] = NULL;
150424d2f903SChristoph Hellwig 					goto fail;
1505e9b267d9SChristoph Hellwig 				}
1506a5164405SJens Axboe 			}
1507e9b267d9SChristoph Hellwig 
1508320ae51fSJens Axboe 			p += rq_size;
1509320ae51fSJens Axboe 			i++;
1510320ae51fSJens Axboe 		}
1511320ae51fSJens Axboe 	}
151224d2f903SChristoph Hellwig 	return tags;
1513320ae51fSJens Axboe 
151424d2f903SChristoph Hellwig fail:
151524d2f903SChristoph Hellwig 	blk_mq_free_rq_map(set, tags, hctx_idx);
151624d2f903SChristoph Hellwig 	return NULL;
1517320ae51fSJens Axboe }
1518320ae51fSJens Axboe 
15191429d7c9SJens Axboe static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
15201429d7c9SJens Axboe {
15211429d7c9SJens Axboe 	kfree(bitmap->map);
15221429d7c9SJens Axboe }
15231429d7c9SJens Axboe 
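/*
 * Allocate the per-hctx pending-work bitmap: one bit per software queue
 * (ctx), split into words of bits_per_word (8) bits, with each word
 * recording its own depth so the final, partially used word is handled
 * correctly.
 */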
15241429d7c9SJens Axboe static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
15251429d7c9SJens Axboe {
15261429d7c9SJens Axboe 	unsigned int bpw = 8, total, num_maps, i;
15271429d7c9SJens Axboe 
15281429d7c9SJens Axboe 	bitmap->bits_per_word = bpw;
15291429d7c9SJens Axboe 
15301429d7c9SJens Axboe 	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
15311429d7c9SJens Axboe 	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
15321429d7c9SJens Axboe 					GFP_KERNEL, node);
15331429d7c9SJens Axboe 	if (!bitmap->map)
15341429d7c9SJens Axboe 		return -ENOMEM;
15351429d7c9SJens Axboe 
15361429d7c9SJens Axboe 	total = nr_cpu_ids;
15371429d7c9SJens Axboe 	for (i = 0; i < num_maps; i++) {
15381429d7c9SJens Axboe 		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
15391429d7c9SJens Axboe 		total -= bitmap->map[i].depth;
15401429d7c9SJens Axboe 	}
15411429d7c9SJens Axboe 
15421429d7c9SJens Axboe 	return 0;
15431429d7c9SJens Axboe }
15441429d7c9SJens Axboe 
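/*
 * A CPU is going offline: splice any requests still pending on its
 * software queue over to the software queue of the CPU running the
 * notifier, mark that ctx pending and kick the hardware queue so the
 * requests are not lost.
 */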
1545484b4061SJens Axboe static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1546484b4061SJens Axboe {
1547484b4061SJens Axboe 	struct request_queue *q = hctx->queue;
1548484b4061SJens Axboe 	struct blk_mq_ctx *ctx;
1549484b4061SJens Axboe 	LIST_HEAD(tmp);
1550484b4061SJens Axboe 
1551484b4061SJens Axboe 	/*
1552484b4061SJens Axboe 	 * Move ctx entries to new CPU, if this one is going away.
1553484b4061SJens Axboe 	 */
1554484b4061SJens Axboe 	ctx = __blk_mq_get_ctx(q, cpu);
1555484b4061SJens Axboe 
1556484b4061SJens Axboe 	spin_lock(&ctx->lock);
1557484b4061SJens Axboe 	if (!list_empty(&ctx->rq_list)) {
1558484b4061SJens Axboe 		list_splice_init(&ctx->rq_list, &tmp);
1559484b4061SJens Axboe 		blk_mq_hctx_clear_pending(hctx, ctx);
1560484b4061SJens Axboe 	}
1561484b4061SJens Axboe 	spin_unlock(&ctx->lock);
1562484b4061SJens Axboe 
1563484b4061SJens Axboe 	if (list_empty(&tmp))
1564484b4061SJens Axboe 		return NOTIFY_OK;
1565484b4061SJens Axboe 
1566484b4061SJens Axboe 	ctx = blk_mq_get_ctx(q);
1567484b4061SJens Axboe 	spin_lock(&ctx->lock);
1568484b4061SJens Axboe 
1569484b4061SJens Axboe 	while (!list_empty(&tmp)) {
1570484b4061SJens Axboe 		struct request *rq;
1571484b4061SJens Axboe 
1572484b4061SJens Axboe 		rq = list_first_entry(&tmp, struct request, queuelist);
1573484b4061SJens Axboe 		rq->mq_ctx = ctx;
1574484b4061SJens Axboe 		list_move_tail(&rq->queuelist, &ctx->rq_list);
1575484b4061SJens Axboe 	}
1576484b4061SJens Axboe 
1577484b4061SJens Axboe 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
1578484b4061SJens Axboe 	blk_mq_hctx_mark_pending(hctx, ctx);
1579484b4061SJens Axboe 
1580484b4061SJens Axboe 	spin_unlock(&ctx->lock);
1581484b4061SJens Axboe 
1582484b4061SJens Axboe 	blk_mq_run_hw_queue(hctx, true);
1583484b4061SJens Axboe 	blk_mq_put_ctx(ctx);
1584484b4061SJens Axboe 	return NOTIFY_OK;
1585484b4061SJens Axboe }
1586484b4061SJens Axboe 
1587484b4061SJens Axboe static int blk_mq_hctx_notify(void *data, unsigned long action,
1588484b4061SJens Axboe 			      unsigned int cpu)
1589484b4061SJens Axboe {
1590484b4061SJens Axboe 	struct blk_mq_hw_ctx *hctx = data;
1591484b4061SJens Axboe 
1592484b4061SJens Axboe 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1593484b4061SJens Axboe 		return blk_mq_hctx_cpu_offline(hctx, cpu);
15942a34c087SMing Lei 
15952a34c087SMing Lei 	/*
15962a34c087SMing Lei 	 * In case of CPU online, tags may be reallocated
15972a34c087SMing Lei 	 * in blk_mq_map_swqueue() after mapping is updated.
15982a34c087SMing Lei 	 */
1599484b4061SJens Axboe 
1600484b4061SJens Axboe 	return NOTIFY_OK;
1601484b4061SJens Axboe }
1602484b4061SJens Axboe 
1603c3b4afcaSMing Lei /* hctx->ctxs will be freed in queue's release handler */
160408e98fc6SMing Lei static void blk_mq_exit_hctx(struct request_queue *q,
160508e98fc6SMing Lei 		struct blk_mq_tag_set *set,
160608e98fc6SMing Lei 		struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
160708e98fc6SMing Lei {
1608f70ced09SMing Lei 	unsigned flush_start_tag = set->queue_depth;
1609f70ced09SMing Lei 
161008e98fc6SMing Lei 	blk_mq_tag_idle(hctx);
161108e98fc6SMing Lei 
1612f70ced09SMing Lei 	if (set->ops->exit_request)
1613f70ced09SMing Lei 		set->ops->exit_request(set->driver_data,
1614f70ced09SMing Lei 				       hctx->fq->flush_rq, hctx_idx,
1615f70ced09SMing Lei 				       flush_start_tag + hctx_idx);
1616f70ced09SMing Lei 
161708e98fc6SMing Lei 	if (set->ops->exit_hctx)
161808e98fc6SMing Lei 		set->ops->exit_hctx(hctx, hctx_idx);
161908e98fc6SMing Lei 
162008e98fc6SMing Lei 	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1621f70ced09SMing Lei 	blk_free_flush_queue(hctx->fq);
162208e98fc6SMing Lei 	blk_mq_free_bitmap(&hctx->ctx_map);
162308e98fc6SMing Lei }
162408e98fc6SMing Lei 
1625624dbe47SMing Lei static void blk_mq_exit_hw_queues(struct request_queue *q,
1626624dbe47SMing Lei 		struct blk_mq_tag_set *set, int nr_queue)
1627624dbe47SMing Lei {
1628624dbe47SMing Lei 	struct blk_mq_hw_ctx *hctx;
1629624dbe47SMing Lei 	unsigned int i;
1630624dbe47SMing Lei 
1631624dbe47SMing Lei 	queue_for_each_hw_ctx(q, hctx, i) {
1632624dbe47SMing Lei 		if (i == nr_queue)
1633624dbe47SMing Lei 			break;
163408e98fc6SMing Lei 		blk_mq_exit_hctx(q, set, hctx, i);
1635624dbe47SMing Lei 	}
1636624dbe47SMing Lei }
1637624dbe47SMing Lei 
1638624dbe47SMing Lei static void blk_mq_free_hw_queues(struct request_queue *q,
1639624dbe47SMing Lei 		struct blk_mq_tag_set *set)
1640624dbe47SMing Lei {
1641624dbe47SMing Lei 	struct blk_mq_hw_ctx *hctx;
1642624dbe47SMing Lei 	unsigned int i;
1643624dbe47SMing Lei 
1644e09aae7eSMing Lei 	queue_for_each_hw_ctx(q, hctx, i)
1645624dbe47SMing Lei 		free_cpumask_var(hctx->cpumask);
1646624dbe47SMing Lei }
1647624dbe47SMing Lei 
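/*
 * Set up a single hardware queue context: CPU notifier, software queue
 * array, pending bitmap, driver ->init_hctx() state and the per-hctx
 * flush queue and flush request. On failure, everything initialized so
 * far is unwound in reverse order.
 */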
164808e98fc6SMing Lei static int blk_mq_init_hctx(struct request_queue *q,
164908e98fc6SMing Lei 		struct blk_mq_tag_set *set,
165008e98fc6SMing Lei 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1651320ae51fSJens Axboe {
1652320ae51fSJens Axboe 	int node;
1653f70ced09SMing Lei 	unsigned flush_start_tag = set->queue_depth;
1654320ae51fSJens Axboe 
1655320ae51fSJens Axboe 	node = hctx->numa_node;
1656320ae51fSJens Axboe 	if (node == NUMA_NO_NODE)
165724d2f903SChristoph Hellwig 		node = hctx->numa_node = set->numa_node;
1658320ae51fSJens Axboe 
165970f4db63SChristoph Hellwig 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
166070f4db63SChristoph Hellwig 	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1661320ae51fSJens Axboe 	spin_lock_init(&hctx->lock);
1662320ae51fSJens Axboe 	INIT_LIST_HEAD(&hctx->dispatch);
1663320ae51fSJens Axboe 	hctx->queue = q;
166408e98fc6SMing Lei 	hctx->queue_num = hctx_idx;
166524d2f903SChristoph Hellwig 	hctx->flags = set->flags;
1666320ae51fSJens Axboe 
1667320ae51fSJens Axboe 	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1668320ae51fSJens Axboe 					blk_mq_hctx_notify, hctx);
1669320ae51fSJens Axboe 	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1670320ae51fSJens Axboe 
167108e98fc6SMing Lei 	hctx->tags = set->tags[hctx_idx];
1672320ae51fSJens Axboe 
1673320ae51fSJens Axboe 	/*
1674a68aafa5SJens Axboe 	 * Allocate space for all possible cpus to avoid allocation at
1675320ae51fSJens Axboe 	 * runtime
1676320ae51fSJens Axboe 	 */
1677320ae51fSJens Axboe 	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1678320ae51fSJens Axboe 					GFP_KERNEL, node);
1679320ae51fSJens Axboe 	if (!hctx->ctxs)
168008e98fc6SMing Lei 		goto unregister_cpu_notifier;
1681320ae51fSJens Axboe 
16821429d7c9SJens Axboe 	if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
168308e98fc6SMing Lei 		goto free_ctxs;
1684320ae51fSJens Axboe 
1685320ae51fSJens Axboe 	hctx->nr_ctx = 0;
1686320ae51fSJens Axboe 
168724d2f903SChristoph Hellwig 	if (set->ops->init_hctx &&
168808e98fc6SMing Lei 	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
168908e98fc6SMing Lei 		goto free_bitmap;
169008e98fc6SMing Lei 
1691f70ced09SMing Lei 	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1692f70ced09SMing Lei 	if (!hctx->fq)
1693f70ced09SMing Lei 		goto exit_hctx;
1694f70ced09SMing Lei 
1695f70ced09SMing Lei 	if (set->ops->init_request &&
1696f70ced09SMing Lei 	    set->ops->init_request(set->driver_data,
1697f70ced09SMing Lei 				   hctx->fq->flush_rq, hctx_idx,
1698f70ced09SMing Lei 				   flush_start_tag + hctx_idx, node))
1699f70ced09SMing Lei 		goto free_fq;
1700f70ced09SMing Lei 
170108e98fc6SMing Lei 	return 0;
170208e98fc6SMing Lei 
1703f70ced09SMing Lei  free_fq:
1704f70ced09SMing Lei 	blk_free_flush_queue(hctx->fq);
1705f70ced09SMing Lei  exit_hctx:
1706f70ced09SMing Lei 	if (set->ops->exit_hctx)
1707f70ced09SMing Lei 		set->ops->exit_hctx(hctx, hctx_idx);
170808e98fc6SMing Lei  free_bitmap:
170908e98fc6SMing Lei 	blk_mq_free_bitmap(&hctx->ctx_map);
171008e98fc6SMing Lei  free_ctxs:
171108e98fc6SMing Lei 	kfree(hctx->ctxs);
171208e98fc6SMing Lei  unregister_cpu_notifier:
171308e98fc6SMing Lei 	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
171408e98fc6SMing Lei 
171508e98fc6SMing Lei 	return -1;
171608e98fc6SMing Lei }
171708e98fc6SMing Lei 
171808e98fc6SMing Lei static int blk_mq_init_hw_queues(struct request_queue *q,
171908e98fc6SMing Lei 		struct blk_mq_tag_set *set)
172008e98fc6SMing Lei {
172108e98fc6SMing Lei 	struct blk_mq_hw_ctx *hctx;
172208e98fc6SMing Lei 	unsigned int i;
172308e98fc6SMing Lei 
172408e98fc6SMing Lei 	/*
172508e98fc6SMing Lei 	 * Initialize hardware queues
172608e98fc6SMing Lei 	 */
172708e98fc6SMing Lei 	queue_for_each_hw_ctx(q, hctx, i) {
172808e98fc6SMing Lei 		if (blk_mq_init_hctx(q, set, hctx, i))
1729320ae51fSJens Axboe 			break;
1730320ae51fSJens Axboe 	}
1731320ae51fSJens Axboe 
1732320ae51fSJens Axboe 	if (i == q->nr_hw_queues)
1733320ae51fSJens Axboe 		return 0;
1734320ae51fSJens Axboe 
1735320ae51fSJens Axboe 	/*
1736320ae51fSJens Axboe 	 * Init failed
1737320ae51fSJens Axboe 	 */
1738624dbe47SMing Lei 	blk_mq_exit_hw_queues(q, set, i);
1739320ae51fSJens Axboe 
1740320ae51fSJens Axboe 	return 1;
1741320ae51fSJens Axboe }
1742320ae51fSJens Axboe 
1743320ae51fSJens Axboe static void blk_mq_init_cpu_queues(struct request_queue *q,
1744320ae51fSJens Axboe 				   unsigned int nr_hw_queues)
1745320ae51fSJens Axboe {
1746320ae51fSJens Axboe 	unsigned int i;
1747320ae51fSJens Axboe 
1748320ae51fSJens Axboe 	for_each_possible_cpu(i) {
1749320ae51fSJens Axboe 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1750320ae51fSJens Axboe 		struct blk_mq_hw_ctx *hctx;
1751320ae51fSJens Axboe 
1752320ae51fSJens Axboe 		memset(__ctx, 0, sizeof(*__ctx));
1753320ae51fSJens Axboe 		__ctx->cpu = i;
1754320ae51fSJens Axboe 		spin_lock_init(&__ctx->lock);
1755320ae51fSJens Axboe 		INIT_LIST_HEAD(&__ctx->rq_list);
1756320ae51fSJens Axboe 		__ctx->queue = q;
1757320ae51fSJens Axboe 
1758320ae51fSJens Axboe 		/* If the cpu isn't online, the cpu is mapped to the first hctx */
1759320ae51fSJens Axboe 		if (!cpu_online(i))
1760320ae51fSJens Axboe 			continue;
1761320ae51fSJens Axboe 
1762e4043dcfSJens Axboe 		hctx = q->mq_ops->map_queue(q, i);
1763e4043dcfSJens Axboe 
1764320ae51fSJens Axboe 		/*
1765320ae51fSJens Axboe 		 * Set local node, IFF we have more than one hw queue. If
1766320ae51fSJens Axboe 		 * not, we remain on the home node of the device
1767320ae51fSJens Axboe 		 */
1768320ae51fSJens Axboe 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1769320ae51fSJens Axboe 			hctx->numa_node = cpu_to_node(i);
1770320ae51fSJens Axboe 	}
1771320ae51fSJens Axboe }
1772320ae51fSJens Axboe 
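/*
 * (Re)build the software to hardware queue mapping for the given set of
 * online CPUs. Hardware queues that end up with no mapped software
 * queues get their tag maps freed; unmapped queues that become mapped
 * again get them reallocated.
 */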
17735778322eSAkinobu Mita static void blk_mq_map_swqueue(struct request_queue *q,
17745778322eSAkinobu Mita 			       const struct cpumask *online_mask)
1775320ae51fSJens Axboe {
1776320ae51fSJens Axboe 	unsigned int i;
1777320ae51fSJens Axboe 	struct blk_mq_hw_ctx *hctx;
1778320ae51fSJens Axboe 	struct blk_mq_ctx *ctx;
17792a34c087SMing Lei 	struct blk_mq_tag_set *set = q->tag_set;
1780320ae51fSJens Axboe 
178160de074bSAkinobu Mita 	/*
178260de074bSAkinobu Mita 	 * Avoid others reading an incomplete hctx->cpumask through sysfs
178360de074bSAkinobu Mita 	 */
178460de074bSAkinobu Mita 	mutex_lock(&q->sysfs_lock);
178560de074bSAkinobu Mita 
1786320ae51fSJens Axboe 	queue_for_each_hw_ctx(q, hctx, i) {
1787e4043dcfSJens Axboe 		cpumask_clear(hctx->cpumask);
1788320ae51fSJens Axboe 		hctx->nr_ctx = 0;
1789320ae51fSJens Axboe 	}
1790320ae51fSJens Axboe 
1791320ae51fSJens Axboe 	/*
1792320ae51fSJens Axboe 	 * Map software to hardware queues
1793320ae51fSJens Axboe 	 */
1794320ae51fSJens Axboe 	queue_for_each_ctx(q, ctx, i) {
1795320ae51fSJens Axboe 		/* If the cpu isn't online, the cpu is mapped to the first hctx */
17965778322eSAkinobu Mita 		if (!cpumask_test_cpu(i, online_mask))
1797e4043dcfSJens Axboe 			continue;
1798e4043dcfSJens Axboe 
1799320ae51fSJens Axboe 		hctx = q->mq_ops->map_queue(q, i);
1800e4043dcfSJens Axboe 		cpumask_set_cpu(i, hctx->cpumask);
1801320ae51fSJens Axboe 		ctx->index_hw = hctx->nr_ctx;
1802320ae51fSJens Axboe 		hctx->ctxs[hctx->nr_ctx++] = ctx;
1803320ae51fSJens Axboe 	}
1804506e931fSJens Axboe 
180560de074bSAkinobu Mita 	mutex_unlock(&q->sysfs_lock);
180660de074bSAkinobu Mita 
1807506e931fSJens Axboe 	queue_for_each_hw_ctx(q, hctx, i) {
1808889fa31fSChong Yuan 		struct blk_mq_ctxmap *map = &hctx->ctx_map;
1809889fa31fSChong Yuan 
1810484b4061SJens Axboe 		/*
1811a68aafa5SJens Axboe 		 * If no software queues are mapped to this hardware queue,
1812a68aafa5SJens Axboe 		 * disable it and free the request entries.
1813484b4061SJens Axboe 		 */
1814484b4061SJens Axboe 		if (!hctx->nr_ctx) {
1815484b4061SJens Axboe 			if (set->tags[i]) {
1816484b4061SJens Axboe 				blk_mq_free_rq_map(set, set->tags[i], i);
1817484b4061SJens Axboe 				set->tags[i] = NULL;
1818484b4061SJens Axboe 			}
18192a34c087SMing Lei 			hctx->tags = NULL;
1820484b4061SJens Axboe 			continue;
1821484b4061SJens Axboe 		}
1822484b4061SJens Axboe 
18232a34c087SMing Lei 		/* unmapped hw queue can be remapped after CPU topo changed */
18242a34c087SMing Lei 		if (!set->tags[i])
18252a34c087SMing Lei 			set->tags[i] = blk_mq_init_rq_map(set, i);
18262a34c087SMing Lei 		hctx->tags = set->tags[i];
18272a34c087SMing Lei 		WARN_ON(!hctx->tags);
18282a34c087SMing Lei 
1829484b4061SJens Axboe 		/*
1830889fa31fSChong Yuan 		 * Set the map size to the number of mapped software queues.
1831889fa31fSChong Yuan 		 * This is more accurate and more efficient than looping
1832889fa31fSChong Yuan 		 * over all possibly mapped software queues.
1833889fa31fSChong Yuan 		 */
1834569fd0ceSJens Axboe 		map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word);
1835889fa31fSChong Yuan 
1836889fa31fSChong Yuan 		/*
1837484b4061SJens Axboe 		 * Initialize batch roundrobin counts
1838484b4061SJens Axboe 		 */
1839506e931fSJens Axboe 		hctx->next_cpu = cpumask_first(hctx->cpumask);
1840506e931fSJens Axboe 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1841506e931fSJens Axboe 	}
18421356aae0SAkinobu Mita 
18431356aae0SAkinobu Mita 	queue_for_each_ctx(q, ctx, i) {
18445778322eSAkinobu Mita 		if (!cpumask_test_cpu(i, online_mask))
18451356aae0SAkinobu Mita 			continue;
18461356aae0SAkinobu Mita 
18471356aae0SAkinobu Mita 		hctx = q->mq_ops->map_queue(q, i);
18481356aae0SAkinobu Mita 		cpumask_set_cpu(i, hctx->tags->cpumask);
18491356aae0SAkinobu Mita 	}
1850320ae51fSJens Axboe }
1851320ae51fSJens Axboe 
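/*
 * A tag set may be shared by several request queues. Whenever a queue is
 * added to or removed from a set, update BLK_MQ_F_TAG_SHARED on all
 * hardware queues of every queue in the set, freezing each queue while
 * its flags change.
 */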
18520d2602caSJens Axboe static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
18530d2602caSJens Axboe {
18540d2602caSJens Axboe 	struct blk_mq_hw_ctx *hctx;
18550d2602caSJens Axboe 	struct request_queue *q;
18560d2602caSJens Axboe 	bool shared;
18570d2602caSJens Axboe 	int i;
18580d2602caSJens Axboe 
18590d2602caSJens Axboe 	if (set->tag_list.next == set->tag_list.prev)
18600d2602caSJens Axboe 		shared = false;
18610d2602caSJens Axboe 	else
18620d2602caSJens Axboe 		shared = true;
18630d2602caSJens Axboe 
18640d2602caSJens Axboe 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
18650d2602caSJens Axboe 		blk_mq_freeze_queue(q);
18660d2602caSJens Axboe 
18670d2602caSJens Axboe 		queue_for_each_hw_ctx(q, hctx, i) {
18680d2602caSJens Axboe 			if (shared)
18690d2602caSJens Axboe 				hctx->flags |= BLK_MQ_F_TAG_SHARED;
18700d2602caSJens Axboe 			else
18710d2602caSJens Axboe 				hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
18720d2602caSJens Axboe 		}
18730d2602caSJens Axboe 		blk_mq_unfreeze_queue(q);
18740d2602caSJens Axboe 	}
18750d2602caSJens Axboe }
18760d2602caSJens Axboe 
18770d2602caSJens Axboe static void blk_mq_del_queue_tag_set(struct request_queue *q)
18780d2602caSJens Axboe {
18790d2602caSJens Axboe 	struct blk_mq_tag_set *set = q->tag_set;
18800d2602caSJens Axboe 
18810d2602caSJens Axboe 	mutex_lock(&set->tag_list_lock);
18820d2602caSJens Axboe 	list_del_init(&q->tag_set_list);
18830d2602caSJens Axboe 	blk_mq_update_tag_set_depth(set);
18840d2602caSJens Axboe 	mutex_unlock(&set->tag_list_lock);
18850d2602caSJens Axboe }
18860d2602caSJens Axboe 
18870d2602caSJens Axboe static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
18880d2602caSJens Axboe 				     struct request_queue *q)
18890d2602caSJens Axboe {
18900d2602caSJens Axboe 	q->tag_set = set;
18910d2602caSJens Axboe 
18920d2602caSJens Axboe 	mutex_lock(&set->tag_list_lock);
18930d2602caSJens Axboe 	list_add_tail(&q->tag_set_list, &set->tag_list);
18940d2602caSJens Axboe 	blk_mq_update_tag_set_depth(set);
18950d2602caSJens Axboe 	mutex_unlock(&set->tag_list_lock);
18960d2602caSJens Axboe }
18970d2602caSJens Axboe 
1898e09aae7eSMing Lei /*
1899e09aae7eSMing Lei  * This is the actual release handler for mq, but we do it from the
1900e09aae7eSMing Lei  * request queue's release handler to avoid use-after-free headaches:
1901e09aae7eSMing Lei  * q->mq_kobj shouldn't have been introduced, but we can't group the
1902e09aae7eSMing Lei  * ctx/kctx kobjects without it.
1903e09aae7eSMing Lei  */
1904e09aae7eSMing Lei void blk_mq_release(struct request_queue *q)
1905e09aae7eSMing Lei {
1906e09aae7eSMing Lei 	struct blk_mq_hw_ctx *hctx;
1907e09aae7eSMing Lei 	unsigned int i;
1908e09aae7eSMing Lei 
1909e09aae7eSMing Lei 	/* hctx kobj stays in hctx */
1910c3b4afcaSMing Lei 	queue_for_each_hw_ctx(q, hctx, i) {
1911c3b4afcaSMing Lei 		if (!hctx)
1912c3b4afcaSMing Lei 			continue;
1913c3b4afcaSMing Lei 		kfree(hctx->ctxs);
1914e09aae7eSMing Lei 		kfree(hctx);
1915c3b4afcaSMing Lei 	}
1916e09aae7eSMing Lei 
1917a723bab3SAkinobu Mita 	kfree(q->mq_map);
1918a723bab3SAkinobu Mita 	q->mq_map = NULL;
1919a723bab3SAkinobu Mita 
1920e09aae7eSMing Lei 	kfree(q->queue_hw_ctx);
1921e09aae7eSMing Lei 
1922e09aae7eSMing Lei 	/* ctx kobj stays in queue_ctx */
1923e09aae7eSMing Lei 	free_percpu(q->queue_ctx);
1924e09aae7eSMing Lei }
1925e09aae7eSMing Lei 
192624d2f903SChristoph Hellwig struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1927320ae51fSJens Axboe {
1928b62c21b7SMike Snitzer 	struct request_queue *uninit_q, *q;
1929b62c21b7SMike Snitzer 
1930b62c21b7SMike Snitzer 	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1931b62c21b7SMike Snitzer 	if (!uninit_q)
1932b62c21b7SMike Snitzer 		return ERR_PTR(-ENOMEM);
1933b62c21b7SMike Snitzer 
1934b62c21b7SMike Snitzer 	q = blk_mq_init_allocated_queue(set, uninit_q);
1935b62c21b7SMike Snitzer 	if (IS_ERR(q))
1936b62c21b7SMike Snitzer 		blk_cleanup_queue(uninit_q);
1937b62c21b7SMike Snitzer 
1938b62c21b7SMike Snitzer 	return q;
1939b62c21b7SMike Snitzer }
1940b62c21b7SMike Snitzer EXPORT_SYMBOL(blk_mq_init_queue);
1941b62c21b7SMike Snitzer 
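/*
 * Wire up a pre-allocated request queue from a tag set: per-CPU software
 * queues, hardware queue contexts, the CPU to hardware queue map, and
 * the make_request function (multi-queue or single-queue variant,
 * depending on nr_hw_queues).
 */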
1942b62c21b7SMike Snitzer struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
1943b62c21b7SMike Snitzer 						  struct request_queue *q)
1944b62c21b7SMike Snitzer {
1945320ae51fSJens Axboe 	struct blk_mq_hw_ctx **hctxs;
1946e6cdb092SMing Lei 	struct blk_mq_ctx __percpu *ctx;
1947f14bbe77SJens Axboe 	unsigned int *map;
1948320ae51fSJens Axboe 	int i;
1949320ae51fSJens Axboe 
1950320ae51fSJens Axboe 	ctx = alloc_percpu(struct blk_mq_ctx);
1951320ae51fSJens Axboe 	if (!ctx)
1952320ae51fSJens Axboe 		return ERR_PTR(-ENOMEM);
1953320ae51fSJens Axboe 
195424d2f903SChristoph Hellwig 	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
195524d2f903SChristoph Hellwig 			set->numa_node);
1956320ae51fSJens Axboe 
1957320ae51fSJens Axboe 	if (!hctxs)
1958320ae51fSJens Axboe 		goto err_percpu;
1959320ae51fSJens Axboe 
1960f14bbe77SJens Axboe 	map = blk_mq_make_queue_map(set);
1961f14bbe77SJens Axboe 	if (!map)
1962f14bbe77SJens Axboe 		goto err_map;
1963f14bbe77SJens Axboe 
196424d2f903SChristoph Hellwig 	for (i = 0; i < set->nr_hw_queues; i++) {
1965f14bbe77SJens Axboe 		int node = blk_mq_hw_queue_to_node(map, i);
1966f14bbe77SJens Axboe 
1967cdef54ddSChristoph Hellwig 		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1968cdef54ddSChristoph Hellwig 					GFP_KERNEL, node);
1969320ae51fSJens Axboe 		if (!hctxs[i])
1970320ae51fSJens Axboe 			goto err_hctxs;
1971320ae51fSJens Axboe 
1972a86073e4SJens Axboe 		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1973a86073e4SJens Axboe 						node))
1974e4043dcfSJens Axboe 			goto err_hctxs;
1975e4043dcfSJens Axboe 
19760d2602caSJens Axboe 		atomic_set(&hctxs[i]->nr_active, 0);
1977f14bbe77SJens Axboe 		hctxs[i]->numa_node = node;
1978320ae51fSJens Axboe 		hctxs[i]->queue_num = i;
1979320ae51fSJens Axboe 	}
1980320ae51fSJens Axboe 
1981320ae51fSJens Axboe 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1982e56f698bSMing Lei 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
1983320ae51fSJens Axboe 
1984320ae51fSJens Axboe 	q->nr_queues = nr_cpu_ids;
198524d2f903SChristoph Hellwig 	q->nr_hw_queues = set->nr_hw_queues;
1986f14bbe77SJens Axboe 	q->mq_map = map;
1987320ae51fSJens Axboe 
1988320ae51fSJens Axboe 	q->queue_ctx = ctx;
1989320ae51fSJens Axboe 	q->queue_hw_ctx = hctxs;
1990320ae51fSJens Axboe 
199124d2f903SChristoph Hellwig 	q->mq_ops = set->ops;
199294eddfbeSJens Axboe 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1993320ae51fSJens Axboe 
199405f1dd53SJens Axboe 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
199505f1dd53SJens Axboe 		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
199605f1dd53SJens Axboe 
19971be036e9SChristoph Hellwig 	q->sg_reserved_size = INT_MAX;
19981be036e9SChristoph Hellwig 
19996fca6a61SChristoph Hellwig 	INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
20006fca6a61SChristoph Hellwig 	INIT_LIST_HEAD(&q->requeue_list);
20016fca6a61SChristoph Hellwig 	spin_lock_init(&q->requeue_lock);
20026fca6a61SChristoph Hellwig 
200307068d5bSJens Axboe 	if (q->nr_hw_queues > 1)
2004320ae51fSJens Axboe 		blk_queue_make_request(q, blk_mq_make_request);
200507068d5bSJens Axboe 	else
200607068d5bSJens Axboe 		blk_queue_make_request(q, blk_sq_make_request);
200707068d5bSJens Axboe 
2008eba71768SJens Axboe 	/*
2009eba71768SJens Axboe 	 * Do this after blk_queue_make_request() overrides it...
2010eba71768SJens Axboe 	 */
2011eba71768SJens Axboe 	q->nr_requests = set->queue_depth;
2012eba71768SJens Axboe 
201324d2f903SChristoph Hellwig 	if (set->ops->complete)
201424d2f903SChristoph Hellwig 		blk_queue_softirq_done(q, set->ops->complete);
201530a91cb4SChristoph Hellwig 
201624d2f903SChristoph Hellwig 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2017320ae51fSJens Axboe 
201824d2f903SChristoph Hellwig 	if (blk_mq_init_hw_queues(q, set))
2019b62c21b7SMike Snitzer 		goto err_hctxs;
202018741986SChristoph Hellwig 
20215778322eSAkinobu Mita 	get_online_cpus();
2022320ae51fSJens Axboe 	mutex_lock(&all_q_mutex);
20234593fdbeSAkinobu Mita 
2024320ae51fSJens Axboe 	list_add_tail(&q->all_q_node, &all_q_list);
20250d2602caSJens Axboe 	blk_mq_add_queue_tag_set(set, q);
20265778322eSAkinobu Mita 	blk_mq_map_swqueue(q, cpu_online_mask);
20270d2602caSJens Axboe 
20284593fdbeSAkinobu Mita 	mutex_unlock(&all_q_mutex);
20295778322eSAkinobu Mita 	put_online_cpus();
2030484b4061SJens Axboe 
2031320ae51fSJens Axboe 	return q;
203218741986SChristoph Hellwig 
2033320ae51fSJens Axboe err_hctxs:
2034f14bbe77SJens Axboe 	kfree(map);
203524d2f903SChristoph Hellwig 	for (i = 0; i < set->nr_hw_queues; i++) {
2036320ae51fSJens Axboe 		if (!hctxs[i])
2037320ae51fSJens Axboe 			break;
2038e4043dcfSJens Axboe 		free_cpumask_var(hctxs[i]->cpumask);
2039cdef54ddSChristoph Hellwig 		kfree(hctxs[i]);
2040320ae51fSJens Axboe 	}
2041f14bbe77SJens Axboe err_map:
2042320ae51fSJens Axboe 	kfree(hctxs);
2043320ae51fSJens Axboe err_percpu:
2044320ae51fSJens Axboe 	free_percpu(ctx);
2045320ae51fSJens Axboe 	return ERR_PTR(-ENOMEM);
2046320ae51fSJens Axboe }
2047b62c21b7SMike Snitzer EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2048320ae51fSJens Axboe 
2049320ae51fSJens Axboe void blk_mq_free_queue(struct request_queue *q)
2050320ae51fSJens Axboe {
2051624dbe47SMing Lei 	struct blk_mq_tag_set	*set = q->tag_set;
2052320ae51fSJens Axboe 
20530e626368SAkinobu Mita 	mutex_lock(&all_q_mutex);
20540e626368SAkinobu Mita 	list_del_init(&q->all_q_node);
20550e626368SAkinobu Mita 	mutex_unlock(&all_q_mutex);
20560e626368SAkinobu Mita 
20570d2602caSJens Axboe 	blk_mq_del_queue_tag_set(q);
20580d2602caSJens Axboe 
2059624dbe47SMing Lei 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2060624dbe47SMing Lei 	blk_mq_free_hw_queues(q, set);
2061320ae51fSJens Axboe }
2062320ae51fSJens Axboe 
2063320ae51fSJens Axboe /* Basically redo blk_mq_init_queue with queue frozen */
20645778322eSAkinobu Mita static void blk_mq_queue_reinit(struct request_queue *q,
20655778322eSAkinobu Mita 				const struct cpumask *online_mask)
2066320ae51fSJens Axboe {
20674ecd4fefSChristoph Hellwig 	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2068320ae51fSJens Axboe 
206967aec14cSJens Axboe 	blk_mq_sysfs_unregister(q);
207067aec14cSJens Axboe 
20715778322eSAkinobu Mita 	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
2072320ae51fSJens Axboe 
2073320ae51fSJens Axboe 	/*
2074320ae51fSJens Axboe 	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2075320ae51fSJens Axboe 	 * we should change hctx numa_node according to new topology (this
2076320ae51fSJens Axboe 	 * involves freeing and re-allocating memory; is it worth doing?)
2077320ae51fSJens Axboe 	 */
2078320ae51fSJens Axboe 
20795778322eSAkinobu Mita 	blk_mq_map_swqueue(q, online_mask);
2080320ae51fSJens Axboe 
208167aec14cSJens Axboe 	blk_mq_sysfs_register(q);
2082320ae51fSJens Axboe }
2083320ae51fSJens Axboe 
2084f618ef7cSPaul Gortmaker static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
2085320ae51fSJens Axboe 				      unsigned long action, void *hcpu)
2086320ae51fSJens Axboe {
2087320ae51fSJens Axboe 	struct request_queue *q;
20885778322eSAkinobu Mita 	int cpu = (unsigned long)hcpu;
20895778322eSAkinobu Mita 	/*
20905778322eSAkinobu Mita 	 * New online cpumask which is going to be set in this hotplug event.
20915778322eSAkinobu Mita 	 * Declare this cpumask as global, as cpu-hotplug operations are invoked
20925778322eSAkinobu Mita 	 * one-by-one and dynamically allocating this could result in a failure.
20935778322eSAkinobu Mita 	 */
20945778322eSAkinobu Mita 	static struct cpumask online_new;
2095320ae51fSJens Axboe 
2096320ae51fSJens Axboe 	/*
20975778322eSAkinobu Mita 	 * Before a hotadded cpu starts handling requests, new mappings must
20985778322eSAkinobu Mita 	 * be established.  Otherwise, requests in its hw queue might
20995778322eSAkinobu Mita 	 * never be dispatched.
21005778322eSAkinobu Mita 	 *
21015778322eSAkinobu Mita 	 * For example, there is a single hw queue (hctx) and two CPU queues
21025778322eSAkinobu Mita 	 * (ctx0 for CPU0, and ctx1 for CPU1).
21035778322eSAkinobu Mita 	 *
21045778322eSAkinobu Mita 	 * Now CPU1 is just onlined and a request is inserted into
21055778322eSAkinobu Mita 	 * ctx1->rq_list and bit0 is set in the pending bitmap, as ctx1->index_hw is
21065778322eSAkinobu Mita 	 * still zero.
21075778322eSAkinobu Mita 	 *
21085778322eSAkinobu Mita 	 * And then while running hw queue, flush_busy_ctxs() finds bit0 is
21095778322eSAkinobu Mita 	 * set in pending bitmap and tries to retrieve requests in
21105778322eSAkinobu Mita 	 * hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0,
21115778322eSAkinobu Mita 	 * so the request in ctx1->rq_list is ignored.
2112320ae51fSJens Axboe 	 */
21135778322eSAkinobu Mita 	switch (action & ~CPU_TASKS_FROZEN) {
21145778322eSAkinobu Mita 	case CPU_DEAD:
21155778322eSAkinobu Mita 	case CPU_UP_CANCELED:
21165778322eSAkinobu Mita 		cpumask_copy(&online_new, cpu_online_mask);
21175778322eSAkinobu Mita 		break;
21185778322eSAkinobu Mita 	case CPU_UP_PREPARE:
21195778322eSAkinobu Mita 		cpumask_copy(&online_new, cpu_online_mask);
21205778322eSAkinobu Mita 		cpumask_set_cpu(cpu, &online_new);
21215778322eSAkinobu Mita 		break;
21225778322eSAkinobu Mita 	default:
2123320ae51fSJens Axboe 		return NOTIFY_OK;
21245778322eSAkinobu Mita 	}
2125320ae51fSJens Axboe 
2126320ae51fSJens Axboe 	mutex_lock(&all_q_mutex);
2127f3af020bSTejun Heo 
2128f3af020bSTejun Heo 	/*
2129f3af020bSTejun Heo 	 * We need to freeze and reinit all existing queues.  Freezing
2130f3af020bSTejun Heo 	 * involves a synchronous wait for an RCU grace period, and doing it
2131f3af020bSTejun Heo 	 * one by one may take a long time.  Start freezing all queues in
2132f3af020bSTejun Heo 	 * one swoop and then wait for the completions so that freezing can
2133f3af020bSTejun Heo 	 * take place in parallel.
2134f3af020bSTejun Heo 	 */
2135f3af020bSTejun Heo 	list_for_each_entry(q, &all_q_list, all_q_node)
2136f3af020bSTejun Heo 		blk_mq_freeze_queue_start(q);
2137f054b56cSMing Lei 	list_for_each_entry(q, &all_q_list, all_q_node) {
2138f3af020bSTejun Heo 		blk_mq_freeze_queue_wait(q);
2139f3af020bSTejun Heo 
2140f054b56cSMing Lei 		/*
2141f054b56cSMing Lei 		 * timeout handler can't touch hw queue during the
2142f054b56cSMing Lei 		 * reinitialization
2143f054b56cSMing Lei 		 */
2144f054b56cSMing Lei 		del_timer_sync(&q->timeout);
2145f054b56cSMing Lei 	}
2146f054b56cSMing Lei 
2147320ae51fSJens Axboe 	list_for_each_entry(q, &all_q_list, all_q_node)
21485778322eSAkinobu Mita 		blk_mq_queue_reinit(q, &online_new);
2149f3af020bSTejun Heo 
2150f3af020bSTejun Heo 	list_for_each_entry(q, &all_q_list, all_q_node)
2151f3af020bSTejun Heo 		blk_mq_unfreeze_queue(q);
2152f3af020bSTejun Heo 
2153320ae51fSJens Axboe 	mutex_unlock(&all_q_mutex);
2154320ae51fSJens Axboe 	return NOTIFY_OK;
2155320ae51fSJens Axboe }
2156320ae51fSJens Axboe 
2157a5164405SJens Axboe static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2158a5164405SJens Axboe {
2159a5164405SJens Axboe 	int i;
2160a5164405SJens Axboe 
2161a5164405SJens Axboe 	for (i = 0; i < set->nr_hw_queues; i++) {
2162a5164405SJens Axboe 		set->tags[i] = blk_mq_init_rq_map(set, i);
2163a5164405SJens Axboe 		if (!set->tags[i])
2164a5164405SJens Axboe 			goto out_unwind;
2165a5164405SJens Axboe 	}
2166a5164405SJens Axboe 
2167a5164405SJens Axboe 	return 0;
2168a5164405SJens Axboe 
2169a5164405SJens Axboe out_unwind:
2170a5164405SJens Axboe 	while (--i >= 0)
2171a5164405SJens Axboe 		blk_mq_free_rq_map(set, set->tags[i], i);
2172a5164405SJens Axboe 
2173a5164405SJens Axboe 	return -ENOMEM;
2174a5164405SJens Axboe }
2175a5164405SJens Axboe 
2176a5164405SJens Axboe /*
2177a5164405SJens Axboe  * Allocate the request maps associated with this tag_set. Note that this
2178a5164405SJens Axboe  * may reduce the depth asked for, if memory is tight. set->queue_depth
2179a5164405SJens Axboe  * will be updated to reflect the allocated depth.
2180a5164405SJens Axboe  */
2181a5164405SJens Axboe static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2182a5164405SJens Axboe {
2183a5164405SJens Axboe 	unsigned int depth;
2184a5164405SJens Axboe 	int err;
2185a5164405SJens Axboe 
2186a5164405SJens Axboe 	depth = set->queue_depth;
2187a5164405SJens Axboe 	do {
2188a5164405SJens Axboe 		err = __blk_mq_alloc_rq_maps(set);
2189a5164405SJens Axboe 		if (!err)
2190a5164405SJens Axboe 			break;
2191a5164405SJens Axboe 
2192a5164405SJens Axboe 		set->queue_depth >>= 1;
2193a5164405SJens Axboe 		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2194a5164405SJens Axboe 			err = -ENOMEM;
2195a5164405SJens Axboe 			break;
2196a5164405SJens Axboe 		}
2197a5164405SJens Axboe 	} while (set->queue_depth);
2198a5164405SJens Axboe 
2199a5164405SJens Axboe 	if (!set->queue_depth || err) {
2200a5164405SJens Axboe 		pr_err("blk-mq: failed to allocate request map\n");
2201a5164405SJens Axboe 		return -ENOMEM;
2202a5164405SJens Axboe 	}
2203a5164405SJens Axboe 
2204a5164405SJens Axboe 	if (depth != set->queue_depth)
2205a5164405SJens Axboe 		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2206a5164405SJens Axboe 						depth, set->queue_depth);
2207a5164405SJens Axboe 
2208a5164405SJens Axboe 	return 0;
2209a5164405SJens Axboe }
2210a5164405SJens Axboe 
2211f26cdc85SKeith Busch struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
2212f26cdc85SKeith Busch {
2213f26cdc85SKeith Busch 	return tags->cpumask;
2214f26cdc85SKeith Busch }
2215f26cdc85SKeith Busch EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
2216f26cdc85SKeith Busch 
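/*
 * Illustrative sketch of how a driver typically uses the tag set API.
 * This is not lifted from any particular driver; my_mq_ops and
 * struct my_cmd are made-up names standing in for the driver's own
 * blk_mq_ops and per-command payload:
 *
 *	set->ops = &my_mq_ops;	(must provide .queue_rq and .map_queue)
 *	set->nr_hw_queues = 1;
 *	set->queue_depth = 64;
 *	set->reserved_tags = 0;
 *	set->cmd_size = sizeof(struct my_cmd);
 *	set->numa_node = NUMA_NO_NODE;
 *	set->flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *	if (blk_mq_alloc_tag_set(set))
 *		goto out;
 *	q = blk_mq_init_queue(set);
 *	if (IS_ERR(q))
 *		goto out_free_tag_set;
 *
 * On teardown the driver calls blk_cleanup_queue() on each queue and
 * then blk_mq_free_tag_set().
 */
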
2217a4391c64SJens Axboe /*
2218a4391c64SJens Axboe  * Alloc a tag set to be associated with one or more request queues.
2219a4391c64SJens Axboe  * May fail with EINVAL for various error conditions. May adjust the
2220a4391c64SJens Axboe  * requested depth down, if it is too large. In that case, the set
2221a4391c64SJens Axboe  * value will be stored in set->queue_depth.
2222a4391c64SJens Axboe  */
222324d2f903SChristoph Hellwig int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
222424d2f903SChristoph Hellwig {
2225205fb5f5SBart Van Assche 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2226205fb5f5SBart Van Assche 
222724d2f903SChristoph Hellwig 	if (!set->nr_hw_queues)
222824d2f903SChristoph Hellwig 		return -EINVAL;
2229a4391c64SJens Axboe 	if (!set->queue_depth)
223024d2f903SChristoph Hellwig 		return -EINVAL;
223124d2f903SChristoph Hellwig 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
223224d2f903SChristoph Hellwig 		return -EINVAL;
223324d2f903SChristoph Hellwig 
2234f9018ac9SXiaoguang Wang 	if (!set->ops->queue_rq || !set->ops->map_queue)
223524d2f903SChristoph Hellwig 		return -EINVAL;
223624d2f903SChristoph Hellwig 
2237a4391c64SJens Axboe 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2238a4391c64SJens Axboe 		pr_info("blk-mq: reduced tag depth to %u\n",
2239a4391c64SJens Axboe 			BLK_MQ_MAX_DEPTH);
2240a4391c64SJens Axboe 		set->queue_depth = BLK_MQ_MAX_DEPTH;
2241a4391c64SJens Axboe 	}
224224d2f903SChristoph Hellwig 
22436637fadfSShaohua Li 	/*
22446637fadfSShaohua Li 	 * If a crashdump is active, then we are potentially in a very
22456637fadfSShaohua Li 	 * memory constrained environment. Limit us to 1 queue and
22466637fadfSShaohua Li 	 * 64 tags to prevent using too much memory.
22476637fadfSShaohua Li 	 */
22486637fadfSShaohua Li 	if (is_kdump_kernel()) {
22496637fadfSShaohua Li 		set->nr_hw_queues = 1;
22506637fadfSShaohua Li 		set->queue_depth = min(64U, set->queue_depth);
22516637fadfSShaohua Li 	}
22526637fadfSShaohua Li 
225348479005SMing Lei 	set->tags = kmalloc_node(set->nr_hw_queues *
225448479005SMing Lei 				 sizeof(struct blk_mq_tags *),
225524d2f903SChristoph Hellwig 				 GFP_KERNEL, set->numa_node);
225624d2f903SChristoph Hellwig 	if (!set->tags)
2257a5164405SJens Axboe 		return -ENOMEM;
225824d2f903SChristoph Hellwig 
2259a5164405SJens Axboe 	if (blk_mq_alloc_rq_maps(set))
2260a5164405SJens Axboe 		goto enomem;
226124d2f903SChristoph Hellwig 
22620d2602caSJens Axboe 	mutex_init(&set->tag_list_lock);
22630d2602caSJens Axboe 	INIT_LIST_HEAD(&set->tag_list);
22640d2602caSJens Axboe 
226524d2f903SChristoph Hellwig 	return 0;
2266a5164405SJens Axboe enomem:
22675676e7b6SRobert Elliott 	kfree(set->tags);
22685676e7b6SRobert Elliott 	set->tags = NULL;
226924d2f903SChristoph Hellwig 	return -ENOMEM;
227024d2f903SChristoph Hellwig }
227124d2f903SChristoph Hellwig EXPORT_SYMBOL(blk_mq_alloc_tag_set);
227224d2f903SChristoph Hellwig 
227324d2f903SChristoph Hellwig void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
227424d2f903SChristoph Hellwig {
227524d2f903SChristoph Hellwig 	int i;
227624d2f903SChristoph Hellwig 
2277484b4061SJens Axboe 	for (i = 0; i < set->nr_hw_queues; i++) {
2278f26cdc85SKeith Busch 		if (set->tags[i]) {
227924d2f903SChristoph Hellwig 			blk_mq_free_rq_map(set, set->tags[i], i);
2280f26cdc85SKeith Busch 			free_cpumask_var(set->tags[i]->cpumask);
2281f26cdc85SKeith Busch 		}
2282484b4061SJens Axboe 	}
2283484b4061SJens Axboe 
2284981bd189SMing Lei 	kfree(set->tags);
22855676e7b6SRobert Elliott 	set->tags = NULL;
228624d2f903SChristoph Hellwig }
228724d2f903SChristoph Hellwig EXPORT_SYMBOL(blk_mq_free_tag_set);
228824d2f903SChristoph Hellwig 
2289e3a2b3f9SJens Axboe int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2290e3a2b3f9SJens Axboe {
2291e3a2b3f9SJens Axboe 	struct blk_mq_tag_set *set = q->tag_set;
2292e3a2b3f9SJens Axboe 	struct blk_mq_hw_ctx *hctx;
2293e3a2b3f9SJens Axboe 	int i, ret;
2294e3a2b3f9SJens Axboe 
2295e3a2b3f9SJens Axboe 	if (!set || nr > set->queue_depth)
2296e3a2b3f9SJens Axboe 		return -EINVAL;
2297e3a2b3f9SJens Axboe 
2298e3a2b3f9SJens Axboe 	ret = 0;
2299e3a2b3f9SJens Axboe 	queue_for_each_hw_ctx(q, hctx, i) {
2300e3a2b3f9SJens Axboe 		ret = blk_mq_tag_update_depth(hctx->tags, nr);
2301e3a2b3f9SJens Axboe 		if (ret)
2302e3a2b3f9SJens Axboe 			break;
2303e3a2b3f9SJens Axboe 	}
2304e3a2b3f9SJens Axboe 
2305e3a2b3f9SJens Axboe 	if (!ret)
2306e3a2b3f9SJens Axboe 		q->nr_requests = nr;
2307e3a2b3f9SJens Axboe 
2308e3a2b3f9SJens Axboe 	return ret;
2309e3a2b3f9SJens Axboe }
2310e3a2b3f9SJens Axboe 
2311676141e4SJens Axboe void blk_mq_disable_hotplug(void)
2312676141e4SJens Axboe {
2313676141e4SJens Axboe 	mutex_lock(&all_q_mutex);
2314676141e4SJens Axboe }
2315676141e4SJens Axboe 
2316676141e4SJens Axboe void blk_mq_enable_hotplug(void)
2317676141e4SJens Axboe {
2318676141e4SJens Axboe 	mutex_unlock(&all_q_mutex);
2319676141e4SJens Axboe }
2320676141e4SJens Axboe 
2321320ae51fSJens Axboe static int __init blk_mq_init(void)
2322320ae51fSJens Axboe {
2323320ae51fSJens Axboe 	blk_mq_cpu_init();
2324320ae51fSJens Axboe 
2325add703fdSTejun Heo 	hotcpu_notifier(blk_mq_queue_reinit_notify, 0);
2326320ae51fSJens Axboe 
2327320ae51fSJens Axboe 	return 0;
2328320ae51fSJens Axboe }
2329320ae51fSJens Axboe subsys_initcall(blk_mq_init);
2330