xref: /linux/block/blk-mq-dma.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Christoph Hellwig
 */
#include <linux/blk-mq-dma.h>
#include "blk.h"

struct phys_vec {
	phys_addr_t	paddr;
	u32		len;
};

static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
			      struct phys_vec *vec)
{
	unsigned int max_size;
	struct bio_vec bv;

	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
		if (!iter->bio)
			return false;
		vec->paddr = bvec_phys(&req->special_vec);
		vec->len = req->special_vec.bv_len;
		iter->bio = NULL;
		return true;
	}

	if (!iter->iter.bi_size)
		return false;

	bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
	vec->paddr = bvec_phys(&bv);
	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
	bv.bv_len = min(bv.bv_len, max_size);
	bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);

	/*
	 * If we are entirely done with this bi_io_vec entry, check if the next
	 * one could be merged into it.  This typically happens when moving to
	 * the next bio, but some callers also don't pack bvecs tightly.
	 */
	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
		struct bio_vec next;

		if (!iter->iter.bi_size) {
			if (!iter->bio->bi_next)
				break;
			iter->bio = iter->bio->bi_next;
			iter->iter = iter->bio->bi_iter;
		}

		next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
		if (bv.bv_len + next.bv_len > max_size ||
		    !biovec_phys_mergeable(req->q, &bv, &next))
			break;

		bv.bv_len += next.bv_len;
		bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
	}

	vec->len = bv.bv_len;
	return true;
}

/*
 * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
 * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
 * we need to ensure our segments are aligned to this as well.
 *
 * Note that there is no point in using the slightly more complicated IOVA based
 * path for single segment mappings.
 */
static inline bool blk_can_dma_map_iova(struct request *req,
		struct device *dma_dev)
{
	return !((queue_virt_boundary(req->q) + 1) &
		dma_get_merge_boundary(dma_dev));
}
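
/*
 * Worked example (illustrative numbers, not taken from this file): a queue
 * with a 4k virtual boundary has queue_virt_boundary() == 0xfff, and an
 * IOMMU with a 4k granule typically reports dma_get_merge_boundary() ==
 * 0xfff as well, so (0xfff + 1) & 0xfff == 0 and the IOVA path is allowed.
 * A queue without a virtual boundary instead yields (0 + 1) & 0xfff != 0,
 * so its requests always take the direct-mapping path below.
 */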

static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
	iter->len = vec->len;
	return true;
}

static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
		struct blk_dma_iter *iter, struct phys_vec *vec)
{
	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
	if (dma_mapping_error(dma_dev, iter->addr)) {
		iter->status = BLK_STS_RESOURCE;
		return false;
	}
	iter->len = vec->len;
	return true;
}

static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter,
		struct phys_vec *vec)
{
	enum dma_data_direction dir = rq_dma_dir(req);
	unsigned int mapped = 0;
	int error;

	iter->addr = state->addr;
	iter->len = dma_iova_size(state);

	do {
		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
				vec->len, dir, 0);
		if (error)
			break;
		mapped += vec->len;
	} while (blk_map_iter_next(req, &iter->iter, vec));

	error = dma_iova_sync(dma_dev, state, 0, mapped);
	if (error) {
		iter->status = errno_to_blk_status(error);
		return false;
	}

	return true;
}

/**
 * blk_rq_dma_map_iter_start - map the first DMA segment for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
 * caller and don't need to be initialized.  @state needs to be stored for use
 * at unmap time, @iter is only needed at map time.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 *
 * The caller can call blk_rq_dma_map_coalesce() to check if further segments
 * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
 * to try to map the following segments.
 */
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	unsigned int total_len = blk_rq_payload_bytes(req);
	struct phys_vec vec;

	iter->iter.bio = req->bio;
	iter->iter.iter = req->bio->bi_iter;
	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
	iter->status = BLK_STS_OK;

	/*
	 * Grab the first segment ASAP because we'll need it to check for P2P
	 * transfers.
	 */
	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
		switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
					 phys_to_page(vec.paddr))) {
		case PCI_P2PDMA_MAP_BUS_ADDR:
			return blk_dma_map_bus(iter, &vec);
		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
			/*
			 * P2P transfers through the host bridge are treated the
			 * same as non-P2P transfers below and during unmap.
			 */
			req->cmd_flags &= ~REQ_P2PDMA;
			break;
		default:
			iter->status = BLK_STS_INVAL;
			return false;
		}
	}

	if (blk_can_dma_map_iova(req, dma_dev) &&
	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
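
/*
 * Illustrative sketch (not part of this file, driver names are made up): a
 * caller that wants to special-case a fully coalesced IOVA mapping could do
 * the following after a successful blk_rq_dma_map_iter_start(), assuming the
 * blk_rq_dma_map_coalesce(state) form of the helper referenced in the
 * kernel-doc above:
 *
 *	struct blk_dma_iter iter;
 *
 *	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
 *		return iter.status;
 *	if (blk_rq_dma_map_coalesce(state)) {
 *		mydrv_prog_single_range(req, iter.addr, iter.len);
 *		return BLK_STS_OK;
 *	}
 *
 * mydrv_prog_single_range() is a hypothetical helper that programs one
 * contiguous DMA range into the device.
 */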

/**
 * blk_rq_dma_map_iter_next - map the next DMA segment for a request
 * @req:	request to map
 * @dma_dev:	device to map to
 * @state:	DMA IOVA state
 * @iter:	block layer DMA iterator
 *
 * Iterate to the next mapping after a previous call to
 * blk_rq_dma_map_iter_start().  See there for a detailed description of the
 * arguments.
 *
 * Returns %false if there is no segment to map, including due to an error, or
 * %true if it did map a segment.
 *
 * If a segment was mapped, the DMA address for it is returned in @iter.addr and
 * the length in @iter.len.  If no segment was mapped the status code is
 * returned in @iter.status.
 */
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
		struct dma_iova_state *state, struct blk_dma_iter *iter)
{
	struct phys_vec vec;

	if (!blk_map_iter_next(req, &iter->iter, &vec))
		return false;

	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		return blk_dma_map_bus(iter, &vec);
	return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
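
/*
 * Illustrative sketch (not part of this file): the typical per-request
 * mapping loop a driver can build from the two iterator helpers above.
 * mydrv_add_desc() is a hypothetical helper that appends one DMA
 * address/length pair to the device's descriptor list:
 *
 *	struct blk_dma_iter iter;
 *
 *	if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
 *		return iter.status;
 *	do {
 *		mydrv_add_desc(req, iter.addr, iter.len);
 *	} while (blk_rq_dma_map_iter_next(req, dma_dev, state, &iter));
 *	return iter.status;
 *
 * When the loop terminates, iter.status is BLK_STS_OK if every segment was
 * mapped, or the error set by the failing iteration otherwise.
 */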

static inline struct scatterlist *
blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
{
	if (!*sg)
		return sglist;

	/*
	 * If the driver previously mapped a shorter list, we could see a
	 * termination bit prematurely unless it fully inits the sg table
	 * on each mapping. We KNOW that there must be more entries here
	 * or the driver would be buggy, so force clear the termination bit
	 * to avoid doing a full sg_init_table() in drivers for each command.
	 */
	sg_unmark_end(*sg);
	return sg_next(*sg);
}

/*
 * Map a request to a scatterlist and return the number of sg entries set up.
 * The caller must make sure sg can hold rq->nr_phys_segments entries.
 */
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
		    struct scatterlist **last_sg)
{
	struct req_iterator iter = {
		.bio	= rq->bio,
	};
	struct phys_vec vec;
	int nsegs = 0;

	/* the internal flush request may not have bio attached */
	if (iter.bio)
		iter.iter = iter.bio->bi_iter;

	while (blk_map_iter_next(rq, &iter, &vec)) {
		*last_sg = blk_next_sg(last_sg, sglist);
		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
				offset_in_page(vec.paddr));
		nsegs++;
	}

	if (*last_sg)
		sg_mark_end(*last_sg);

	/*
	 * Something must have gone wrong if the number of segments we set up
	 * is bigger than the request's number of physical segments.
	 */
	WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

	return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
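
/*
 * Illustrative sketch (not part of this file): a minimal caller, assuming the
 * driver owns a scatterlist "sgl" sized for the request's physical segments:
 *
 *	struct scatterlist *last_sg = NULL;
 *	int nsegs;
 *
 *	sg_init_table(sgl, blk_rq_nr_phys_segments(rq));
 *	nsegs = __blk_rq_map_sg(rq, sgl, &last_sg);
 *
 * nsegs is then the number of entries filled in, ready to be handed to
 * dma_map_sg() or a similar mapping call.
 */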