1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/fsverity.h>
4 #include <linux/iomap.h>
5 #include "ctree.h"
6 #include "delalloc-space.h"
7 #include "direct-io.h"
8 #include "extent-tree.h"
9 #include "file.h"
10 #include "fs.h"
11 #include "transaction.h"
12 #include "volumes.h"
13 #include "bio.h"
14 #include "ordered-data.h"
15
/*
 * Per-call direct I/O state. btrfs_dio_read()/btrfs_dio_write() place one on
 * the stack and hand it to iomap, where the callbacks find it again through
 * iomap_iter::private. btrfs_dio_iomap_begin() zeroes the whole structure
 * before it sets up each mapping.
 */
struct btrfs_dio_data {
	/* Running total of bio bytes, increased in btrfs_dio_submit_io(). */
	ssize_t submitted;
	/* Changeset filled by btrfs_check_data_free_space(), freed when done. */
	struct extent_changeset *data_reserved;
	/* Ordered extent created for a write by btrfs_create_dio_extent(). */
	struct btrfs_ordered_extent *ordered;
	/* True once btrfs_check_data_free_space() succeeded for this mapping. */
	bool data_space_reserved;
	/* True once a write was set up without allocating a new extent. */
	bool nocow_done;
};
23
/*
 * Per-bio bookkeeping wrapped around the btrfs_bio. The submit and end_io
 * handlers recover it from the embedded bbio with container_of().
 */
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};
32
/*
 * Bio set handed to iomap through btrfs_dio_ops.bio_set.
 * NOTE(review): its initialisation is not visible in this part of the file.
 */
static struct bio_set btrfs_dio_bioset;
34
/*
 * Lock the file range [lockstart, lockend] for a direct I/O request.
 *
 * The DIO lock is acquired first and the extent lock second. The range is
 * only handed back locked when no ordered extent overlaps it and, for writes,
 * no page cache pages exist in it.
 *
 * @inode:        inode the I/O targets
 * @lockstart:    first byte of the range
 * @lockend:      last byte of the range (inclusive)
 * @cached_state: extent state cache passed through to the lock helpers
 * @iomap_flags:  IOMAP_* flags; IOMAP_WRITE and IOMAP_NOWAIT are inspected
 *
 * Returns 0 with both locks held. On failure neither lock is held and the
 * result is -EAGAIN (a NOWAIT request would have had to block) or -ENOTBLK
 * (the caller must fall back to buffered I/O).
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	const bool writing = (iomap_flags & IOMAP_WRITE);
	struct btrfs_ordered_extent *oe;
	int ret = 0;

	/* The DIO lock always comes before the extent lock. */
	if (!nowait)
		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	else if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
		return -EAGAIN;

	for (;;) {
		if (!nowait) {
			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
		} else if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
						  cached_state)) {
			ret = -EAGAIN;
			break;
		}

		/*
		 * The whole range we are about to do DIO against matters, so
		 * look for any ordered extent overlapping it.
		 */
		oe = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						lockend - lockstart + 1);

		if (!oe) {
			/*
			 * Buffered pages must not exist in the range either: we
			 * may have raced between the invalidation done in
			 * generic_file_direct_write and taking the extent lock.
			 * The invalidation is required so that reads after a
			 * write do not see stale data.
			 */
			if (!writing ||
			    !filemap_range_has_page(inode->i_mapping,
						    lockstart, lockend))
				break;

			btrfs_unlock_extent(io_tree, lockstart, lockend,
					    cached_state);

			/*
			 * Triggering writeback for the range, waiting for it
			 * and then invalidating the pages (through
			 * invalidate_inode_pages2_range()) would be possible,
			 * but it can deadlock against a concurrent readahead
			 * (from a buffered read or a defrag) on a page lock:
			 * an ordered dio extent we created earlier may not
			 * have a bio submitted yet (so it can not complete),
			 * and readahead waits for that ordered extent while
			 * holding the page lock.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
			break;
		}

		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (nowait) {
			btrfs_put_ordered_extent(oe);
			ret = -EAGAIN;
			break;
		}

		/*
		 * A DIO read that finds an ordered extent belonging to a
		 * buffered write can not wait for it and retry, because that
		 * can deadlock with concurrent buffered writes on page locks.
		 * This only happens when the DIO read spans more than one
		 * extent map, has already created an ordered extent for a
		 * previous extent map and locked its range in the inode's io
		 * tree, and a concurrent write against that previous range and
		 * this range has started (ranges in the io tree are unlocked
		 * only when the bios complete, and buffered writes always lock
		 * pages before trying to lock the range in the io tree).
		 */
		if (!writing && !test_bit(BTRFS_ORDERED_DIRECT, &oe->flags))
			ret = -ENOTBLK;
		else
			btrfs_start_ordered_extent(oe);
		btrfs_put_ordered_extent(oe);

		if (ret)
			break;

		cond_resched();
	}

	/* Every failure above already dropped (or never took) the extent lock. */
	if (ret)
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}
139
btrfs_create_dio_extent(struct btrfs_inode * inode,struct btrfs_dio_data * dio_data,const u64 start,const struct btrfs_file_extent * file_extent,const int type)140 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
141 struct btrfs_dio_data *dio_data,
142 const u64 start,
143 const struct btrfs_file_extent *file_extent,
144 const int type)
145 {
146 struct extent_map *em = NULL;
147 struct btrfs_ordered_extent *ordered;
148
149 if (type != BTRFS_ORDERED_NOCOW) {
150 em = btrfs_create_io_em(inode, start, file_extent, type);
151 if (IS_ERR(em))
152 goto out;
153 }
154
155 ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
156 (1U << type) |
157 (1U << BTRFS_ORDERED_DIRECT));
158 if (IS_ERR(ordered)) {
159 if (em) {
160 btrfs_free_extent_map(em);
161 btrfs_drop_extent_map_range(inode, start,
162 start + file_extent->num_bytes - 1, false);
163 }
164 em = ERR_CAST(ordered);
165 } else {
166 ASSERT(!dio_data->ordered);
167 dio_data->ordered = ordered;
168 }
169 out:
170
171 return em;
172 }
173
/*
 * Reserve a new extent for a COW direct I/O write and create the matching
 * extent map and ordered extent of type BTRFS_ORDERED_REGULAR.
 *
 * @inode:    inode being written
 * @dio_data: DIO state, passed on to btrfs_create_dio_extent()
 * @start:    file offset of the write
 * @len:      number of bytes requested
 *
 * Returns the new extent map, or an ERR_PTR() when the reservation or the
 * creation of the dio extent fails. In the latter case the reserved extent is
 * given back.
 */
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct btrfs_key ins;
	struct extent_map *em;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);

	for (;;) {
		ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
					   0, alloc_hint, &ins, true, true);
		if (ret != -EAGAIN)
			break;

		/*
		 * -EAGAIN is only expected on a zoned filesystem: wait on the
		 * BTRFS_FS_NEED_ZONE_FINISH bit and retry the reservation.
		 */
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
	}
	if (ret)
		return ERR_PTR(ret);

	/* Describe the reserved extent: uncompressed, mapped from offset 0. */
	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;

	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);

	return em;
}
213
/*
 * Prepare the extent backing a direct I/O write at @start.
 *
 * Either reuses the existing extent (NOCOW or PREALLOC) or allocates a new one
 * (COW), reserves the metadata space needed, and creates the ordered extent
 * through btrfs_create_dio_extent() / btrfs_new_extent_direct().
 *
 * @map:         in: extent map covering @start, with a reference owned by the
 *               caller. out: the extent map the caller must use on success.
 *               On every failure the input reference has been dropped and
 *               *map is set to NULL, so the caller must not free it again.
 * @inode:       inode being written
 * @dio_data:    DIO state; ->nocow_done is set when no new extent was needed
 * @start:       file offset of the write
 * @lenp:        in: requested length. out: length actually covered
 * @iomap_flags: IOMAP_* flags; only IOMAP_NOWAIT is inspected here
 *
 * Returns 0 on success or a negative errno. A NOWAIT request that can not be
 * served without blocking or allocating gets -EAGAIN.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = btrfs_extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
				     false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			btrfs_free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			btrfs_free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			/*
			 * Our caller expects us to free the input extent map on
			 * error. For PREALLOC that already happened above; for
			 * NOCOW the input map is still referenced, so drop it
			 * here. Never leave an error pointer in *map.
			 */
			if (type != BTRFS_ORDERED_PREALLOC)
				btrfs_free_extent_map(em);
			*map = NULL;
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		btrfs_free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		/* The new extent is shorter: give back the unused metadata. */
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	/* Undo the metadata reservation when we failed after taking it. */
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}
354
/*
 * iomap_begin callback for btrfs direct I/O.
 *
 * Locks the file range, looks up (and for writes prepares) the extent backing
 * @start, and describes it to iomap through @iomap.
 *
 * @inode:  inode the I/O targets
 * @start:  file offset of the mapping request
 * @length: number of bytes requested
 * @flags:  IOMAP_* flags; IOMAP_WRITE and IOMAP_NOWAIT are inspected
 * @iomap:  mapping to fill in; also used to reach the iomap_iter and, through
 *          iter->private, the struct btrfs_dio_data of this request
 * @srcmap: not used by this function
 *
 * Returns 0 on success. On failure returns a negative errno with the range
 * unlocked and any data space reserved here released; -EAGAIN is used for
 * NOWAIT requests that can not proceed and -ENOTBLK for compressed or inline
 * extents.
 */
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages no longer under writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	/* Start from a clean per-mapping state (also resets ->submitted). */
	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (!(BTRFS_I(inode)->flags &
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		btrfs_free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	/* Never map past the end of the extent map we found. */
	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		btrfs_free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			/*
			 * Give back the data space we no longer need: all of
			 * it when no new extent was allocated, otherwise only
			 * the part beyond what this mapping covers.
			 */
			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	btrfs_free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
					lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	/* Return the data space reserved before the range was locked. */
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}
607
/*
 * iomap_end callback for btrfs direct I/O.
 *
 * Cleans up whatever part of the mapping [pos, pos + length) had no bio
 * submitted for it and drops the per-mapping write state.
 *
 * @inode:   inode the I/O targets
 * @pos:     file offset where the mapping starts
 * @length:  length of the mapping
 * @written: not used by this function
 * @flags:   IOMAP_* flags; only IOMAP_WRITE is inspected
 * @iomap:   the mapping; used to reach the iomap_iter and its private data
 *
 * Returns 0, or -ENOTBLK when fewer bytes were submitted than were mapped.
 */
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	const bool write = !!(flags & IOMAP_WRITE);
	size_t submitted = dio_data->submitted;
	int ret = 0;

	if (!write) {
		/* A read from a hole submitted nothing: unlock it all. */
		if (iomap->type == IOMAP_HOLE) {
			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
						pos + length - 1, NULL);
			return 0;
		}

		/* Unlock the tail of the range that no bio covers. */
		if (submitted < length) {
			pos += submitted;
			length -= submitted;
			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
						pos + length - 1, NULL);
			ret = -ENOTBLK;
		}
		return ret;
	}

	/* Finish, as not up to date, the part of the ordered extent left over. */
	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		btrfs_finish_ordered_extent(dio_data->ordered, NULL,
					    pos, length, false);
		ret = -ENOTBLK;
	}

	btrfs_put_ordered_extent(dio_data->ordered);
	dio_data->ordered = NULL;
	extent_changeset_free(dio_data->data_reserved);

	return ret;
}
644
btrfs_dio_end_io(struct btrfs_bio * bbio)645 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
646 {
647 struct btrfs_dio_private *dip =
648 container_of(bbio, struct btrfs_dio_private, bbio);
649 struct btrfs_inode *inode = bbio->inode;
650 struct bio *bio = &bbio->bio;
651
652 if (bio->bi_status) {
653 btrfs_warn(inode->root->fs_info,
654 "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
655 btrfs_ino(inode), bio->bi_opf,
656 dip->file_offset, dip->bytes, bio->bi_status);
657 }
658
659 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
660 btrfs_finish_ordered_extent(bbio->ordered, NULL,
661 dip->file_offset, dip->bytes,
662 !bio->bi_status);
663 } else {
664 btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
665 dip->file_offset + dip->bytes - 1, NULL);
666 }
667
668 bbio->bio.bi_private = bbio->private;
669 iomap_dio_bio_end_io(bio);
670 }
671
btrfs_extract_ordered_extent(struct btrfs_bio * bbio,struct btrfs_ordered_extent * ordered)672 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
673 struct btrfs_ordered_extent *ordered)
674 {
675 u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
676 u64 len = bbio->bio.bi_iter.bi_size;
677 struct btrfs_ordered_extent *new;
678 int ret;
679
680 /* Must always be called for the beginning of an ordered extent. */
681 if (WARN_ON_ONCE(start != ordered->disk_bytenr))
682 return -EINVAL;
683
684 /* No need to split if the ordered extent covers the entire bio. */
685 if (ordered->disk_num_bytes == len) {
686 refcount_inc(&ordered->refs);
687 bbio->ordered = ordered;
688 return 0;
689 }
690
691 /*
692 * Don't split the extent_map for NOCOW extents, as we're writing into
693 * a pre-existing one.
694 */
695 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
696 ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
697 ordered->num_bytes, len,
698 ordered->disk_bytenr);
699 if (ret)
700 return ret;
701 }
702
703 new = btrfs_split_ordered_extent(ordered, len);
704 if (IS_ERR(new))
705 return PTR_ERR(new);
706 bbio->ordered = new;
707 return 0;
708 }
709
/*
 * submit_io hook for iomap direct I/O: set up the btrfs_bio wrapping @bio,
 * account for the submitted bytes and send it down.
 *
 * @iter:        iomap iterator; gives the inode, flags and the DIO state
 * @bio:         bio built by iomap
 * @file_offset: file offset the bio starts at
 */
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
		loff_t file_offset)
{
	struct btrfs_dio_data *dio_data = iter->private;
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	int ret;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset,
		       btrfs_dio_end_io, bio->bi_private);

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/* Reads need no ordered extent handling. */
	if (!(iter->flags & IOMAP_WRITE)) {
		btrfs_submit_bbio(bbio, 0);
		return;
	}

	/*
	 * For a partial write the ordered extent has to be split to match the
	 * submitted bio. The remaining, unfinishable ordered extent stays in
	 * dio_data so that iomap_end can cancel it, avoiding a deadlock in
	 * which faulting in the remaining pages is blocked on the outstanding
	 * ordered extent.
	 */
	ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
	if (!ret) {
		btrfs_submit_bbio(bbio, 0);
		return;
	}

	/* Could not attach an ordered extent: fail this range and the bio. */
	btrfs_finish_ordered_extent(dio_data->ordered, NULL,
				    file_offset, dip->bytes, false);
	bio->bi_status = errno_to_blk_status(ret);
	iomap_dio_bio_end_io(bio);
}
749
/* Mapping callbacks passed to iomap by btrfs_dio_read()/btrfs_dio_write(). */
static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin		= btrfs_dio_iomap_begin,
	.iomap_end		= btrfs_dio_iomap_end,
};
754
/* Direct I/O hooks: our own bio submission routine and bio set. */
static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io		= btrfs_dio_submit_io,
	.bio_set		= &btrfs_dio_bioset,
};
759
btrfs_dio_read(struct kiocb * iocb,struct iov_iter * iter,size_t done_before)760 static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
761 size_t done_before)
762 {
763 struct btrfs_dio_data data = { 0 };
764
765 return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
766 IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before);
767 }
768
/*
 * Start a direct I/O write through __iomap_dio_rw() with a zeroed,
 * stack-allocated DIO state and the IOMAP_DIO_PARTIAL |
 * IOMAP_DIO_FSBLOCK_ALIGNED flags.
 *
 * Returns whatever __iomap_dio_rw() returns; the caller handles completion.
 */
static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
					 size_t done_before)
{
	const unsigned int dio_flags = IOMAP_DIO_PARTIAL |
				       IOMAP_DIO_FSBLOCK_ALIGNED;
	struct btrfs_dio_data dio_data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      dio_flags, &dio_data, done_before);
}
777
check_direct_IO(struct btrfs_fs_info * fs_info,const struct iov_iter * iter,loff_t offset)778 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
779 const struct iov_iter *iter, loff_t offset)
780 {
781 const u32 blocksize_mask = fs_info->sectorsize - 1;
782
783 if (offset & blocksize_mask)
784 return -EINVAL;
785
786 if (iov_iter_alignment(iter) & blocksize_mask)
787 return -EINVAL;
788 return 0;
789 }
790
btrfs_direct_write(struct kiocb * iocb,struct iov_iter * from)791 ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
792 {
793 struct file *file = iocb->ki_filp;
794 struct inode *inode = file_inode(file);
795 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
796 loff_t pos;
797 ssize_t written = 0;
798 ssize_t written_buffered;
799 size_t prev_left = 0;
800 loff_t endbyte;
801 ssize_t ret;
802 unsigned int ilock_flags = 0;
803 struct iomap_dio *dio;
804 const u64 data_profile = btrfs_data_alloc_profile(fs_info) &
805 BTRFS_BLOCK_GROUP_PROFILE_MASK;
806
807 if (iocb->ki_flags & IOCB_NOWAIT)
808 ilock_flags |= BTRFS_ILOCK_TRY;
809
810 /*
811 * If the write DIO is within EOF, use a shared lock and also only if
812 * security bits will likely not be dropped by file_remove_privs() called
813 * from btrfs_write_check(). Either will need to be rechecked after the
814 * lock was acquired.
815 */
816 if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
817 ilock_flags |= BTRFS_ILOCK_SHARED;
818
819 /*
820 * If our data profile has duplication (either extra mirrors or RAID56),
821 * we can not trust the direct IO buffer, the content may change during
822 * writeback and cause different contents written to different mirrors.
823 *
824 * Thus only RAID0 and SINGLE can go true zero-copy direct IO.
825 */
826 if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0)
827 goto buffered;
828
829 relock:
830 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
831 if (ret < 0)
832 return ret;
833
834 /* Shared lock cannot be used with security bits set. */
835 if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
836 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
837 ilock_flags &= ~BTRFS_ILOCK_SHARED;
838 goto relock;
839 }
840
841 ret = generic_write_checks(iocb, from);
842 if (ret <= 0) {
843 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
844 return ret;
845 }
846
847 ret = btrfs_write_check(iocb, ret);
848 if (ret < 0) {
849 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
850 goto out;
851 }
852
853 pos = iocb->ki_pos;
854 /*
855 * Re-check since file size may have changed just before taking the
856 * lock or pos may have changed because of O_APPEND in generic_write_check()
857 */
858 if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
859 pos + iov_iter_count(from) > i_size_read(inode)) {
860 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
861 ilock_flags &= ~BTRFS_ILOCK_SHARED;
862 goto relock;
863 }
864
865 if (check_direct_IO(fs_info, from, pos)) {
866 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
867 goto buffered;
868 }
869 /*
870 * We can't control the folios being passed in, applications can write
871 * to them while a direct IO write is in progress. This means the
872 * content might change after we calculated the data checksum.
873 * Therefore we can end up storing a checksum that doesn't match the
874 * persisted data.
875 *
876 * To be extra safe and avoid false data checksum mismatch, if the
877 * inode requires data checksum, just fallback to buffered IO.
878 * For buffered IO we have full control of page cache and can ensure
879 * no one is modifying the content during writeback.
880 */
881 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
882 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
883 goto buffered;
884 }
885
886 /*
887 * The iov_iter can be mapped to the same file range we are writing to.
888 * If that's the case, then we will deadlock in the iomap code, because
889 * it first calls our callback btrfs_dio_iomap_begin(), which will create
890 * an ordered extent, and after that it will fault in the pages that the
891 * iov_iter refers to. During the fault in we end up in the readahead
892 * pages code (starting at btrfs_readahead()), which will lock the range,
893 * find that ordered extent and then wait for it to complete (at
894 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
895 * obviously the ordered extent can never complete as we didn't submit
896 * yet the respective bio(s). This always happens when the buffer is
897 * memory mapped to the same file range, since the iomap DIO code always
898 * invalidates pages in the target file range (after starting and waiting
899 * for any writeback).
900 *
901 * So here we disable page faults in the iov_iter and then retry if we
902 * got -EFAULT, faulting in the pages before the retry.
903 */
904 again:
905 from->nofault = true;
906 dio = btrfs_dio_write(iocb, from, written);
907 from->nofault = false;
908
909 if (IS_ERR_OR_NULL(dio)) {
910 ret = PTR_ERR_OR_ZERO(dio);
911 } else {
912 /*
913 * If we have a synchronous write, we must make sure the fsync
914 * triggered by the iomap_dio_complete() call below doesn't
915 * deadlock on the inode lock - we are already holding it and we
916 * can't call it after unlocking because we may need to complete
917 * partial writes due to the input buffer (or parts of it) not
918 * being already faulted in.
919 */
920 ASSERT(current->journal_info == NULL);
921 current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
922 ret = iomap_dio_complete(dio);
923 current->journal_info = NULL;
924 }
925
926 /* No increment (+=) because iomap returns a cumulative value. */
927 if (ret > 0)
928 written = ret;
929
930 if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
931 const size_t left = iov_iter_count(from);
932 /*
933 * We have more data left to write. Try to fault in as many as
934 * possible of the remainder pages and retry. We do this without
935 * releasing and locking again the inode, to prevent races with
936 * truncate.
937 *
938 * Also, in case the iov refers to pages in the file range of the
939 * file we want to write to (due to a mmap), we could enter an
940 * infinite loop if we retry after faulting the pages in, since
941 * iomap will invalidate any pages in the range early on, before
942 * it tries to fault in the pages of the iov. So we keep track of
943 * how much was left of iov in the previous EFAULT and fallback
944 * to buffered IO in case we haven't made any progress.
945 */
946 if (left == prev_left) {
947 ret = -ENOTBLK;
948 } else {
949 fault_in_iov_iter_readable(from, left);
950 prev_left = left;
951 goto again;
952 }
953 }
954
955 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
956
957 /*
958 * If 'ret' is -ENOTBLK or we have not written all data, then it means
959 * we must fallback to buffered IO.
960 */
961 if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
962 goto out;
963
964 buffered:
965 /*
966 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
967 * it must retry the operation in a context where blocking is acceptable,
968 * because even if we end up not blocking during the buffered IO attempt
969 * below, we will block when flushing and waiting for the IO.
970 */
971 if (iocb->ki_flags & IOCB_NOWAIT) {
972 ret = -EAGAIN;
973 goto out;
974 }
975
976 pos = iocb->ki_pos;
977 written_buffered = btrfs_buffered_write(iocb, from);
978 if (written_buffered < 0) {
979 ret = written_buffered;
980 goto out;
981 }
982 /*
983 * Ensure all data is persisted. We want the next direct IO read to be
984 * able to read what was just written.
985 */
986 endbyte = pos + written_buffered - 1;
987 ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
988 if (ret)
989 goto out;
990 ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
991 if (ret)
992 goto out;
993 written += written_buffered;
994 iocb->ki_pos = pos + written_buffered;
995 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
996 endbyte >> PAGE_SHIFT);
997 out:
998 return ret < 0 ? ret : written;
999 }
1000
/*
 * Validate a direct IO read request before it is attempted.
 *
 * The generic check_direct_IO() validation runs first and any negative
 * result from it is returned unchanged. For iovec backed iterators the
 * segments are then compared pairwise, and the request is rejected if any
 * two of them start at the same address.
 *
 * NOTE(review): the duplicate base check presumably exists because reading
 * into the same memory from two segments gives an ambiguous result - confirm.
 *
 * Return: 0 if the request may be served with direct IO, -EINVAL if two
 * segments share the same base address, or the negative value returned by
 * check_direct_IO().
 */
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	const struct iovec *segs;
	int first, second;
	int ret;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	/* Only iovec backed iterators have segments to compare. */
	if (!iter_is_iovec(iter))
		return 0;

	segs = iter_iov(iter);
	for (first = 0; first < iter->nr_segs; first++) {
		const struct iovec *outer = &segs[first];

		/* Compare against every later segment, so each pair is seen once. */
		for (second = first + 1; second < iter->nr_segs; second++) {
			if (outer->iov_base == segs[second].iov_base)
				return -EINVAL;
		}
	}

	return 0;
}
1025
btrfs_direct_read(struct kiocb * iocb,struct iov_iter * to)1026 ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
1027 {
1028 struct inode *inode = file_inode(iocb->ki_filp);
1029 size_t prev_left = 0;
1030 ssize_t read = 0;
1031 ssize_t ret;
1032
1033 if (fsverity_active(inode))
1034 return 0;
1035
1036 if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1037 return 0;
1038
1039 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1040 again:
1041 /*
1042 * This is similar to what we do for direct IO writes, see the comment
1043 * at btrfs_direct_write(), but we also disable page faults in addition
1044 * to disabling them only at the iov_iter level. This is because when
1045 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1046 * which can still trigger page fault ins despite having set ->nofault
1047 * to true of our 'to' iov_iter.
1048 *
1049 * The difference to direct IO writes is that we deadlock when trying
1050 * to lock the extent range in the inode's tree during he page reads
1051 * triggered by the fault in (while for writes it is due to waiting for
1052 * our own ordered extent). This is because for direct IO reads,
1053 * btrfs_dio_iomap_begin() returns with the extent range locked, which
1054 * is only unlocked in the endio callback (end_bio_extent_readpage()).
1055 */
1056 pagefault_disable();
1057 to->nofault = true;
1058 ret = btrfs_dio_read(iocb, to, read);
1059 to->nofault = false;
1060 pagefault_enable();
1061
1062 /* No increment (+=) because iomap returns a cumulative value. */
1063 if (ret > 0)
1064 read = ret;
1065
1066 if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1067 const size_t left = iov_iter_count(to);
1068
1069 if (left == prev_left) {
1070 /*
1071 * We didn't make any progress since the last attempt,
1072 * fallback to a buffered read for the remainder of the
1073 * range. This is just to avoid any possibility of looping
1074 * for too long.
1075 */
1076 ret = read;
1077 } else {
1078 /*
1079 * We made some progress since the last retry or this is
1080 * the first time we are retrying. Fault in as many pages
1081 * as possible and retry.
1082 */
1083 fault_in_iov_iter_writeable(to, left);
1084 prev_left = left;
1085 goto again;
1086 }
1087 }
1088 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1089 return ret < 0 ? ret : read;
1090 }
1091
btrfs_init_dio(void)1092 int __init btrfs_init_dio(void)
1093 {
1094 if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1095 offsetof(struct btrfs_dio_private, bbio.bio),
1096 BIOSET_NEED_BVECS))
1097 return -ENOMEM;
1098
1099 return 0;
1100 }
1101
/*
 * Tear down the direct IO bio set by calling bioset_exit() on
 * btrfs_dio_bioset, the set that btrfs_init_dio() initializes.
 */
void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}
1106