/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
#include <linux/rwsem.h>

struct blk_mq_tags;
struct blk_flush_queue;
struct io_comp_batch;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

enum rq_end_io_ret {
	RQ_END_IO_NONE,
	RQ_END_IO_FREE,
};

typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
					  const struct io_comp_batch *);

/* request flags */
typedef __u32 __bitwise req_flags_t;

/* Keep rqf_name[] in sync with the definitions below */
enum rqf_flags {
	/* drive already may have started this one */
	__RQF_STARTED,
	/* request for flush sequence */
	__RQF_FLUSH_SEQ,
	/* merge of different types, fail separately */
	__RQF_MIXED_MERGE,
	/* don't call prep for this one */
	__RQF_DONTPREP,
	/* use hctx->sched_tags */
	__RQF_SCHED_TAGS,
	/* use an I/O scheduler for this request */
	__RQF_USE_SCHED,
	/* vaguely specified driver internal error. Ignored by block layer */
	__RQF_FAILED,
	/* don't warn about errors */
	__RQF_QUIET,
	/* account into disk and partition IO statistics */
	__RQF_IO_STAT,
	/* runtime pm request */
	__RQF_PM,
	/* on IO scheduler merge hash */
	__RQF_HASHED,
	/* track IO completion time */
	__RQF_STATS,
	/*
	 * Look at ->special_vec for the actual data payload instead of the
	 * bio chain.
	 */
	__RQF_SPECIAL_PAYLOAD,
	/* request completion needs to be signaled to zone write plugging. */
	__RQF_ZONE_WRITE_PLUGGING,
	/* ->timeout has been called, don't expire again */
	__RQF_TIMED_OUT,
	__RQF_RESV,
	__RQF_BITS
};

#define RQF_STARTED		((__force req_flags_t)(1 << __RQF_STARTED))
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
#define RQF_DONTPREP		((__force req_flags_t)(1 << __RQF_DONTPREP))
#define RQF_SCHED_TAGS		((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
#define RQF_USE_SCHED		((__force req_flags_t)(1 << __RQF_USE_SCHED))
#define RQF_FAILED		((__force req_flags_t)(1 << __RQF_FAILED))
#define RQF_QUIET		((__force req_flags_t)(1 << __RQF_QUIET))
#define RQF_IO_STAT		((__force req_flags_t)(1 << __RQF_IO_STAT))
#define RQF_PM			((__force req_flags_t)(1 << __RQF_PM))
#define RQF_HASHED		((__force req_flags_t)(1 << __RQF_HASHED))
#define RQF_STATS		((__force req_flags_t)(1 << __RQF_STATS))
#define RQF_SPECIAL_PAYLOAD	\
	((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
#define RQF_ZONE_WRITE_PLUGGING	\
	((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << __RQF_TIMED_OUT))
#define RQF_RESV		((__force req_flags_t)(1 << __RQF_RESV))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	blk_opf_t cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
	unsigned short nr_integrity_segments;

	/*
	 * The lowest set bit for address gaps between physical segments. This
	 * provides information necessary for DMA optimization opportunities,
	 * such as testing whether the segments can be coalesced against the
	 * device's IOMMU granule.
	 */
	unsigned char phys_gap_bit;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	enum mq_rq_state state;
	atomic_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. special_vec must
	 * only be used if RQF_SPECIAL_PAYLOAD is set, and such requests
	 * cannot be inserted into an I/O scheduler.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.
	 */
	struct {
		struct io_cq *icq;
		void *priv[2];
	} elv;

	struct {
		unsigned int seq;
		rq_end_io_fn *saved_end_io;
	} flush;

	u64 fifo_time;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

/*
 * Returns a mask with all bits starting at req->phys_gap_bit set to 1.
 */
static inline unsigned long req_phys_gap_mask(const struct request *req)
{
	return ~(((1 << req->phys_gap_bit) >> 1) - 1);
}

static inline enum req_op req_op(const struct request *req)
{
	return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(rq->cmd_flags);
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	if (req->bio)
		return req->bio->bi_ioprio;
	return 0;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

static inline int rq_list_empty(const struct rq_list *rl)
{
	return rl->head == NULL;
}

static inline void rq_list_init(struct rq_list *rl)
{
	rl->head = NULL;
	rl->tail = NULL;
}

static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = NULL;
	if (rl->tail)
		rl->tail->rq_next = rq;
	else
		rl->head = rq;
	rl->tail = rq;
}

static inline void rq_list_add_head(struct rq_list *rl, struct request *rq)
{
	rq->rq_next = rl->head;
	rl->head = rq;
	if (!rl->tail)
		rl->tail = rq;
}

static inline struct request *rq_list_pop(struct rq_list *rl)
{
	struct request *rq = rl->head;

	if (rq) {
		rl->head = rl->head->rq_next;
		if (!rl->head)
			rl->tail = NULL;
		rq->rq_next = NULL;
	}

	return rq;
}

static inline struct request *rq_list_peek(struct rq_list *rl)
{
	return rl->head;
}

#define rq_list_for_each(rl, pos) \
	for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next)

#define rq_list_for_each_safe(rl, pos, nxt) \
	for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \
		pos; pos = nxt, nxt = pos ? pos->rq_next : NULL)
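
/*
 * A minimal usage sketch for the rq_list helpers above; "my_hw_submit"
 * is a hypothetical driver function, shown for illustration only:
 *
 *	struct rq_list rl;
 *	struct request *rq;
 *
 *	rq_list_init(&rl);
 *	rq_list_add_tail(&rl, rq_a);
 *	rq_list_add_tail(&rl, rq_b);
 *	while ((rq = rq_list_pop(&rl)))
 *		my_hw_submit(rq);
 */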

/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver completed the command or will complete it at
 *	a later time.
 * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the
 *	request to complete.
 */
enum blk_eh_timer_return {
	BLK_EH_DONE,
	BLK_EH_RESET_TIMER,
};
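
/*
 * Sketch of a driver ->timeout handler, assuming a hypothetical
 * "my_device_abort" helper; the request is either aborted and completed
 * now, or given more time:
 *
 *	static enum blk_eh_timer_return my_timeout_rq(struct request *rq)
 *	{
 *		if (my_device_abort(rq) == 0) {
 *			blk_mq_complete_request(rq);
 *			return BLK_EH_DONE;
 *		}
 *		return BLK_EH_RESET_TIMER;
 *	}
 */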

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many work items are left in the
	 * batch before changing to the next CPU.
	 */
	int next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void *sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue *queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue *fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx
	 */
	void *driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx *dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using an Exponential Weighted Moving
	 * Average (EWMA) algorithm.
	 */
	unsigned int dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short type;
	/** @nr_ctx: Number of software queues. */
	unsigned short nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx **ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags *tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags *sched_tags;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t nr_active;

	/** @cpuhp_online: List to store requests when a CPU is going offline. */
	struct hlist_node cpuhp_online;
	/** @cpuhp_dead: List to store requests when a CPU dies. */
	struct hlist_node cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry *debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry *sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head hctx_list;
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ: Just for READ I/O.
 * @HCTX_TYPE_POLL: Polled I/O of any kind.
 * @HCTX_MAX_TYPES: Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @ops: Pointers to functions that implement block driver behavior.
 * @map: One or more ctx -> hctx mappings. One map exists for each
 *	hardware queue type (enum hctx_type) that the driver wishes
 *	to support. There are no restrictions on maps being of the
 *	same size, and it's perfectly legal to share maps between
 *	types.
 * @nr_maps: Number of elements in the @map array. A number in the range
 *	[1, HCTX_MAX_TYPES].
 * @nr_hw_queues: Number of hardware queues supported by the block driver that
 *	owns this data structure.
 * @queue_depth: Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *	allocations.
 * @cmd_size: Number of additional bytes to allocate per request. The block
 *	driver owns these additional bytes.
 * @numa_node: NUMA node the storage adapter has been connected to.
 * @timeout: Request processing timeout in jiffies.
 * @flags: Zero or more BLK_MQ_F_* flags.
 * @driver_data: Pointer to data owned by the block driver that created this
 *	tag set.
 * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *	elements.
 * @shared_tags:
 *	Shared set of tags. Has @nr_hw_queues elements. If set,
 *	shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list: List of the request queues that use this tag set. See also
 *	request_queue.tag_set_list.
 * @srcu: Used as a lock when the request queue type is blocking
 *	(BLK_MQ_F_BLOCKING).
 * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent
 *	use-after-free when iterating tags.
 * @update_nr_hwq_lock:
 *	Synchronizes updating nr_hw_queues with adding/deleting disks and
 *	switching elevators.
 */
struct blk_mq_tag_set {
	const struct blk_mq_ops	*ops;
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;

	struct blk_mq_tags	**tags;

	struct blk_mq_tags	*shared_tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
	struct srcu_struct	*srcu;
	struct srcu_struct	tags_srcu;

	struct rw_semaphore	update_nr_hwq_lock;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq: Request pointer.
 * @last: True if this is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_tag_iter_fn)(struct request *, void *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. Driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct rq_list *rqlist);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release
	 * the reserved budget. The failure case of .get_budget must also
	 * be handled to avoid I/O deadlock.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: store rq's budget token
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: retrieve rq's budget token
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tag greater than or equal to queue_depth is for setting up
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually for freeing the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
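
/*
 * A minimal, hypothetical ops table for illustration: ->queue_rq is the
 * only mandatory callback, and "my_hw_submit" is an assumed driver
 * helper:
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (my_hw_submit(rq) != 0)
 *			return BLK_STS_RESOURCE;
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops my_mq_ops = {
 *		.queue_rq	= my_queue_rq,
 *	};
 */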

/* Keep hctx_flag_name[] in sync with the definitions below */
enum {
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING = 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING = 1 << 4,

	/*
	 * Allocate tags on a round-robin basis instead of the first available
	 * one.
	 */
	BLK_MQ_F_TAG_RR = 1 << 5,

	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6,

	BLK_MQ_F_MAX = 1 << 7,
};

#define BLK_MQ_MAX_DEPTH	(10240)
#define BLK_MQ_NO_HCTX_IDX	(-1U)

enum {
	/* Keep hctx_state_name[] in sync with the definitions below */
	BLK_MQ_S_STOPPED,
	BLK_MQ_S_TAG_ACTIVE,
	BLK_MQ_S_SCHED_RESTART,
	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE,
	BLK_MQ_S_MAX
};

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, lim, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, lim, queuedata, &__key);	\
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
		struct lock_class_key *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
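
/*
 * Typical single-queue setup, sketched with hypothetical driver names
 * ("my_dev", "my_mq_ops") and with error handling omitted:
 *
 *	struct blk_mq_tag_set *set = &my_dev->tag_set;
 *	struct gendisk *disk;
 *
 *	ret = blk_mq_alloc_sq_tag_set(set, &my_mq_ops, 64, 0);
 *	disk = blk_mq_alloc_disk(set, NULL, my_dev);
 *	...
 *	blk_mq_free_tag_set(set);
 */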

void blk_mq_free_request(struct request *rq);
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
		unsigned int poll_flags);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		blk_opf_t opf, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;
	unsigned int active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request reference in rqs[] before freeing one
	 * request pool
	 */
	spinlock_t lock;
	struct rcu_head rcu_head;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}
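
/*
 * Example: split the value returned by blk_mq_unique_tag() back into
 * the hardware queue index (upper 16 bits) and the per-queue tag
 * (lower 16 bits):
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);
 */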

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context instead
 * of interrupt context.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}

static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
	return rq->rq_flags & RQF_RESV;
}

/**
 * blk_mq_add_to_batch() - add a request to the completion batch
 * @req: The request to add to batch
 * @iob: The batch to add the request to
 * @is_error: Specify true if the request failed with an error
 * @complete: The completion handler for the request
 *
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 *
 * Return: true when the request was added to the batch, otherwise false
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, bool is_error,
				       void (*complete)(struct io_comp_batch *))
{
	/*
	 * Check various conditions that exclude batch processing:
	 * 1) No batch container
	 * 2) Has scheduler data attached
	 * 3) Not a passthrough request and end_io set
	 * 4) Not a passthrough request and failed with an error
	 */
	if (!iob)
		return false;
	if (req->rq_flags & RQF_SCHED_TAGS)
		return false;
	if (!blk_rq_is_passthrough(req)) {
		if (req->end_io)
			return false;
		if (is_error)
			return false;
	}

	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add_tail(&iob->req_list, req);
	return true;
}
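
/*
 * Sketch of batched completion from a driver's ->poll handler;
 * "my_poll_one", "my_rq_is_error" and "my_complete_batch" are
 * hypothetical helpers:
 *
 *	while ((rq = my_poll_one(hctx))) {
 *		if (!blk_mq_add_to_batch(rq, iob, my_rq_is_error(rq),
 *					 my_complete_batch))
 *			blk_mq_complete_request(rq);
 *	}
 *
 * The batch is later drained through iob->complete(), which typically
 * calls blk_mq_end_request_batch().
 */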

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue_nomemsave(struct request_queue *q);
void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q);
static inline unsigned int __must_check
blk_mq_freeze_queue(struct request_queue *q)
{
	unsigned int memflags = memalloc_noio_save();

	blk_mq_freeze_queue_nomemsave(q);
	return memflags;
}
static inline void
blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags)
{
	blk_mq_unfreeze_queue_nomemrestore(q);
	memalloc_noio_restore(memflags);
}
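
/*
 * Canonical freeze/unfreeze pattern around a queue update; the memflags
 * cookie returned by blk_mq_freeze_queue() must be handed back:
 *
 *	unsigned int memflags = blk_mq_freeze_queue(q);
 *
 *	... update state that must not race with in-flight I/O ...
 *
 *	blk_mq_unfreeze_queue(q, memflags);
 */
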
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
		unsigned long timeout);
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
void blk_freeze_queue_start_non_owner(struct request_queue *q);

unsigned int blk_mq_num_possible_queues(unsigned int max_queues);
unsigned int blk_mq_num_online_queues(unsigned int max_queues);
void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
		struct device *dev, unsigned int offset);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
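
/*
 * The PDU lives directly behind the request when the tag set was
 * allocated with a matching @cmd_size; "struct my_cmd" is hypothetical:
 *
 *	set->cmd_size = sizeof(struct my_cmd);
 *	...
 *	struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *	struct request *rq2 = blk_mq_rq_from_pdu(cmd);
 *
 * after which rq2 == rq.
 */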

static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
{
	struct blk_mq_hw_ctx *hctx;

	rcu_read_lock();
	hctx = rcu_dereference(q->queue_hw_ctx)[id];
	rcu_read_unlock();

	return hctx;
}

#define queue_for_each_hw_ctx(q, hctx, i)			\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&		\
	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)				\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&			\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
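
/*
 * Example: visit every hardware queue of a request queue:
 *
 *	struct blk_mq_hw_ctx *hctx;
 *	unsigned long i;
 *
 *	queue_for_each_hw_ctx(q, hctx, i)
 *		pr_info("hctx %u serves %u software queues\n",
 *			hctx->queue_num, hctx->nr_ctx);
 */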

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

struct rq_map_data {
	struct page **pages;
	unsigned long offset;
	unsigned short page_order;
	unsigned short nr_entries;
	bool null_mapped;
	bool from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_io(struct request *, struct rq_map_data *,
		void __user *, unsigned long, gfp_t, bool, int, bool, int);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
		gfp_t gfp);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
bool blk_rq_is_poll(struct request *rq);

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))
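
/*
 * Example: walk every segment of a request; "my_copy_segment" is a
 * hypothetical helper, and highmem mapping concerns are ignored for
 * brevity:
 *
 *	struct req_iterator iter;
 *	struct bio_vec bvec;
 *
 *	rq_for_each_segment(bvec, rq, iter)
 *		my_copy_segment(page_address(bvec.bv_page) + bvec.bv_offset,
 *				bvec.bv_len);
 */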

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check that
 * there are any bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
		unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

/**
 * blk_rq_nr_bvec - return number of bvecs in a request
 * @rq: request to calculate bvecs for
 *
 * Returns the number of bvecs.
 */
static inline unsigned int blk_rq_nr_bvec(struct request *rq)
{
	struct req_iterator rq_iter;
	struct bio_vec bv;
	unsigned int nr_bvec = 0;

	rq_for_each_bvec(bv, rq, rq_iter)
		nr_bvec++;

	return nr_bvec;
}

int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(rq, sglist, &last_sg);
}
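
/*
 * Sketch of DMA mapping inside ->queue_rq, assuming a hypothetical
 * "my->sglist" sized using blk_rq_nr_phys_segments():
 *
 *	int nents = blk_rq_map_sg(rq, my->sglist);
 *
 *	if (!dma_map_sg(my->dev, my->sglist, nents, rq_dma_dir(rq)))
 *		return BLK_STS_RESOURCE;
 */
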
void blk_dump_rq_flags(struct request *, char *);

#endif /* BLK_MQ_H */