1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 * Copyright (C) 2022 Christoph Hellwig.
5 */
6
7 #include <linux/blk_types.h>
8 #include <linux/bio.h>
9 #include "bio.h"
10 #include "ctree.h"
11 #include "volumes.h"
12 #include "raid56.h"
13 #include "async-thread.h"
14 #include "dev-replace.h"
15 #include "zoned.h"
16 #include "file-item.h"
17 #include "raid-stripe-tree.h"
18
19 static struct bio_set btrfs_bioset;
20 static struct bio_set btrfs_clone_bioset;
21 static struct bio_set btrfs_repair_bioset;
22 static mempool_t btrfs_failed_bio_pool;
23
/*
 * State shared by all in-flight repair reads spawned for one failed bio.
 */
struct btrfs_failed_bio {
	/* The original bio that saw the I/O or checksum failure. */
	struct btrfs_bio *bbio;
	/* Number of copies (mirrors) available for the failed range. */
	int num_copies;
	/* Outstanding repair reads; the last one completes @bbio. */
	atomic_t repair_count;
};
29
30 /* Is this a data path I/O that needs storage layer checksum and repair? */
is_data_bbio(const struct btrfs_bio * bbio)31 static inline bool is_data_bbio(const struct btrfs_bio *bbio)
32 {
33 return bbio->inode && is_data_inode(bbio->inode);
34 }
35
bbio_has_ordered_extent(const struct btrfs_bio * bbio)36 static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
37 {
38 return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
39 }
40
41 /*
42 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
43 * is already initialized by the block layer.
44 */
btrfs_bio_init(struct btrfs_bio * bbio,struct btrfs_inode * inode,u64 file_offset,btrfs_bio_end_io_t end_io,void * private)45 void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset,
46 btrfs_bio_end_io_t end_io, void *private)
47 {
48 /* @inode parameter is mandatory. */
49 ASSERT(inode);
50
51 memset(bbio, 0, offsetof(struct btrfs_bio, bio));
52 bbio->inode = inode;
53 bbio->end_io = end_io;
54 bbio->private = private;
55 bbio->file_offset = file_offset;
56 atomic_set(&bbio->pending_ios, 1);
57 WRITE_ONCE(bbio->status, BLK_STS_OK);
58 }
59
60 /*
61 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
62 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
63 *
64 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
65 * a mempool.
66 */
btrfs_bio_alloc(unsigned int nr_vecs,blk_opf_t opf,struct btrfs_inode * inode,u64 file_offset,btrfs_bio_end_io_t end_io,void * private)67 struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
68 struct btrfs_inode *inode, u64 file_offset,
69 btrfs_bio_end_io_t end_io, void *private)
70 {
71 struct btrfs_bio *bbio;
72 struct bio *bio;
73
74 bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
75 bbio = btrfs_bio(bio);
76 btrfs_bio_init(bbio, inode, file_offset, end_io, private);
77 return bbio;
78 }
79
/*
 * Split the front @map_length bytes off @orig_bbio into a new btrfs_bio.
 *
 * The split part keeps the original's current file_offset (and, for data
 * writes, orig_logical), while @orig_bbio is advanced past the split range.
 * The original's pending_ios count is raised so its completion waits for the
 * split part.
 *
 * Returns the new btrfs_bio, or an ERR_PTR() if bio_split() fails.
 */
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	if (IS_ERR(bio))
		return ERR_CAST(bio);

	bbio = btrfs_bio(bio);
	/* The split part covers the current file_offset; advance the original. */
	btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio);
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		/* Each split data write holds its own ordered extent reference. */
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
		bbio->orig_logical = orig_bbio->orig_logical;
		orig_bbio->orig_logical += map_length;
	}

	/* Propagate per-submission state that applies to every split part. */
	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
	bbio->can_use_append = orig_bbio->can_use_append;
	bbio->is_scrub = orig_bbio->is_scrub;
	bbio->is_remap = orig_bbio->is_remap;
	bbio->async_csum = orig_bbio->async_csum;

	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}
111
/*
 * Complete a btrfs_bio with the given status.
 *
 * For a bio created by btrfs_split_bio() this frees the split part and folds
 * its status into the original; the original's end_io handler only runs once
 * all pending split parts have completed.  Must be called from task context.
 */
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	/* Make sure we're already in task context. */
	ASSERT(in_task());

	/* Async checksumming must have finished before completing the bio. */
	if (bbio->async_csum)
		wait_for_completion(&bbio->csum_done);

	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		/* Free bio that was never submitted to the underlying device. */
		if (bbio_has_ordered_extent(bbio))
			btrfs_put_ordered_extent(bbio->ordered);
		bio_put(&bbio->bio);

		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load split bio's error which might be set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);

		if (bbio_has_ordered_extent(bbio)) {
			struct btrfs_ordered_extent *ordered = bbio->ordered;

			/* Save @ordered locally: ->end_io may release bbio. */
			bbio->end_io(bbio);
			btrfs_put_ordered_extent(ordered);
		} else {
			bbio->end_io(bbio);
		}
	}
}
154
next_repair_mirror(const struct btrfs_failed_bio * fbio,int cur_mirror)155 static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
156 {
157 if (cur_mirror == fbio->num_copies)
158 return cur_mirror + 1 - fbio->num_copies;
159 return cur_mirror + 1;
160 }
161
prev_repair_mirror(const struct btrfs_failed_bio * fbio,int cur_mirror)162 static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
163 {
164 if (cur_mirror == 1)
165 return fbio->num_copies;
166 return cur_mirror - 1;
167 }
168
btrfs_repair_done(struct btrfs_failed_bio * fbio)169 static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
170 {
171 if (atomic_dec_and_test(&fbio->repair_count)) {
172 btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
173 mempool_free(fbio, &btrfs_failed_bio_pool);
174 }
175 }
176
/*
 * End I/O handler for a single-block repair read.
 *
 * If the read failed or its checksum does not verify, the repair read is
 * re-submitted to the next mirror; once all mirrors have been tried, the
 * error is recorded on the original bio.  On success the good data is
 * written back to the mirror(s) previously tried.
 */
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	/*
	 * We can not move forward the saved_iter, as it will be later
	 * utilized by repair_bbio again.
	 */
	struct bvec_iter saved_iter = repair_bbio->saved_iter;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
	const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
	int mirror = repair_bbio->mirror_num;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	unsigned int slot = 0;

	/* Repair bbio should be exactly one block sized. */
	ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);

	/* Collect the physical address of each step-sized part of the block. */
	btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
		ASSERT(slot < nr_steps);
		paddrs[slot] = paddr;
		slot++;
	}

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) {
		/* Still bad: reset the bio and try the next mirror. */
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			/* Wrapped back around to the failed mirror: give up. */
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	/* Good data: write it back to the previously tried mirror(s). */
	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				repair_bbio->file_offset, fs_info->sectorsize,
				logical, paddrs, step, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}
232
/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded to restore the redundancy.
 *
 * @failed_bbio: bio that saw the failure
 * @bio_offset:  byte offset of the bad block inside @failed_bbio
 * @paddrs:      physical addresses of the bad block's memory, one per step
 * @fbio:        repair state, allocated on first call and reused afterwards
 *
 * Returns the (possibly newly allocated) repair state.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  phys_addr_t paddrs[],
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	/*
	 * For bs > ps cases, the saved_iter can be partially moved forward.
	 * In that case we should round it down to the block boundary.
	 */
	const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				       sectorsize);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		/* Only one copy exists, there is nothing to repair from. */
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		/* First bad block in this bio: set up the shared repair state. */
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	/* Build a one-block read bio pointing at the same memory as the bad block. */
	repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		int ret;

		/* Each step must not cross a page boundary. */
		ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE);

		ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step,
				   offset_in_page(paddrs[i]));
		ASSERT(ret == step);
	}

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset,
		       NULL, fbio);

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bbio(repair_bbio, mirror);
	return fbio;
}
302
/*
 * Verify a completed data read block by block and kick off a repair read for
 * every block that failed or has a checksum mismatch.
 *
 * Repair bios themselves are routed to btrfs_end_repair_bio() instead.
 * Completion of the original bio is deferred until all repairs finish.
 */
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	/* A block may span multiple pages; iterate in page-sized steps at most. */
	const u32 step = min(sectorsize, PAGE_SIZE);
	const u32 nr_steps = sectorsize / step;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
	phys_addr_t paddr;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) {
		/* Collect the physical addresses of the current block's steps. */
		paddrs[(offset / step) % nr_steps] = paddr;
		offset += step;

		/* Once a full block is collected, verify and maybe repair it. */
		if (IS_ALIGNED(offset, sectorsize)) {
			if (status ||
			    !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs))
				fbio = repair_one_sector(bbio, offset - sectorsize,
							 paddrs, fbio);
		}
	}
	/* Free the csum array if it did not fit into the inline buffer. */
	if (bbio->csum != bbio->csum_inline)
		kvfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
351
btrfs_log_dev_io_error(const struct bio * bio,struct btrfs_device * dev)352 static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
353 {
354 blk_status_t sts = bio->bi_status;
355
356 if (!dev || !dev->bdev)
357 return;
358 if (unlikely(sts == BLK_STS_OK))
359 return;
360 if (unlikely(sts != BLK_STS_IOERR && sts != BLK_STS_TARGET &&
361 sts != BLK_STS_MEDIUM && sts != BLK_STS_PROTECTION)) {
362 btrfs_warn_rl(dev->fs_info, "bdev %s unexpected block io error: %d",
363 btrfs_dev_name(dev), sts);
364 return;
365 }
366 if (btrfs_op(bio) == BTRFS_MAP_WRITE)
367 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
368 else if (!(bio->bi_opf & REQ_RAHEAD))
369 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
370 if (bio->bi_opf & REQ_PREFLUSH)
371 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
372 }
373
btrfs_end_io_wq(const struct btrfs_fs_info * fs_info,const struct bio * bio)374 static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
375 const struct bio *bio)
376 {
377 if (bio->bi_opf & REQ_META)
378 return fs_info->endio_meta_workers;
379 return fs_info->endio_workers;
380 }
381
/*
 * Deferred (task context) completion for bios submitted to a single device.
 *
 * Data reads go through checksum verification and read-repair; successful
 * zone append writes record their final physical position first.
 */
static void simple_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;

	if (bio_op(bio) == REQ_OP_READ) {
		/* Metadata reads are checked and repaired by the submitter. */
		if (is_data_bbio(bbio))
			return btrfs_check_read_bio(bbio, bbio->bio.bi_private);
		return btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	}
	/* Successful zone append: record where the data actually landed. */
	if (bio_is_zone_append(bio) && !bio->bi_status)
		btrfs_record_physical_zoned(bbio);
	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
397
/*
 * Bio end_io for the single-mirror fast path.
 *
 * Only does the accounting here and defers the real completion work to a
 * workqueue so it runs in task context.
 */
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	/* bi_private was set to the target device in btrfs_submit_bio(). */
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	INIT_WORK(&bbio->end_io_work, simple_end_io_work);
	queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}
412
/*
 * Bio end_io for parity RAID reads and writes.
 *
 * The raid56 code already completes bios from a workqueue, so checksum
 * verification of data reads can run inline here.
 */
static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	/* RAID56 endio is always handled in workqueue. */
	ASSERT(in_task());

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}
430
/*
 * Deferred completion for the original bio of a mirrored write (the copy
 * that reused the embedded bio for the last stripe).
 *
 * Per-device errors are tolerated as long as the total stays within the
 * block group's redundancy (bioc->max_errors).
 */
static void orig_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	/* Successful zone append: record where the data actually landed. */
	if (bio_is_zone_append(bio) && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	btrfs_put_bioc(bioc);
}
460
btrfs_orig_write_end_io(struct bio * bio)461 static void btrfs_orig_write_end_io(struct bio *bio)
462 {
463 struct btrfs_bio *bbio = btrfs_bio(bio);
464
465 INIT_WORK(&bbio->end_io_work, orig_write_end_io_work);
466 queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
467 }
468
/*
 * Deferred completion for a cloned copy of a mirrored write.
 *
 * Accounts any error in the shared io_context and then drops the remaining
 * count of the original bio it was cloned from.
 */
static void clone_write_end_io_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
	struct bio *bio = &bbio->bio;
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_is_zone_append(bio)) {
		/* Record where the zone append actually landed. */
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}
486
btrfs_clone_write_end_io(struct bio * bio)487 static void btrfs_clone_write_end_io(struct bio *bio)
488 {
489 struct btrfs_bio *bbio = btrfs_bio(bio);
490
491 INIT_WORK(&bbio->end_io_work, clone_write_end_io_work);
492 queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work);
493 }
494
btrfs_submit_dev_bio(struct btrfs_device * dev,struct bio * bio)495 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
496 {
497 u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
498
499 if (!dev || !dev->bdev ||
500 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
501 (btrfs_op(bio) == BTRFS_MAP_WRITE &&
502 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
503 bio_io_error(bio);
504 return;
505 }
506
507 bio_set_dev(bio, dev->bdev);
508
509 /*
510 * For zone append writing, bi_sector must point the beginning of the
511 * zone
512 */
513 if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) {
514 u64 zone_start = round_down(physical, dev->fs_info->zone_size);
515
516 ASSERT(btrfs_dev_is_sequential(dev, physical));
517 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
518 bio->bi_opf &= ~REQ_OP_WRITE;
519 bio->bi_opf |= REQ_OP_ZONE_APPEND;
520 }
521 btrfs_debug(dev->fs_info,
522 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
523 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
524 (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
525 dev->devid, bio->bi_iter.bi_size);
526
527 /*
528 * Track reads if tracking is enabled; ignore I/O operations before the
529 * filesystem is fully initialized.
530 */
531 if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
532 percpu_counter_add(&dev->fs_info->stats_read_blocks,
533 bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);
534
535 if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
536 blkcg_punt_bio_submit(bio);
537 else
538 submit_bio(bio);
539 }
540
/*
 * Submit the copy of a mirrored write destined for stripe @dev_nr of @bioc.
 *
 * The last stripe reuses the original bio; every other stripe gets a clone
 * with its own end_io and an extra remaining count on the original.
 */
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;
	struct btrfs_bio *orig_bbio = btrfs_bio(orig_bio);

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		/* We need to use endio_work to run end_io in task context. */
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset);
		bio_inc_remaining(orig_bio);
		btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode,
			       orig_bbio->file_offset, NULL, NULL);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	/* Point the bio at its stripe's device and physical location. */
	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
567
/*
 * Dispatch an already mapped bio to the lower layers.
 *
 * Three cases: single mirror (fast path straight to one device), parity RAID
 * (handled by the raid56 code), or a write mirrored to multiple devices.
 */
static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}
597
/*
 * Generate checksums for a write bio: btree csums for metadata, data csums
 * otherwise.  Returns 0 on success or a negative errno.
 */
static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/*
	 * NOTE(review): the second argument only differs for experimental
	 * builds — presumably enables async csum; confirm against
	 * btrfs_csum_one_bio().
	 */
	return btrfs_csum_one_bio(bbio, true);
#else
	return btrfs_csum_one_bio(bbio, false);
#endif
}
608
/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	/* The bio to checksum and then submit. */
	struct btrfs_bio *bbio;
	/* Saved mapping results, forwarded to btrfs_submit_bio() later. */
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	/* Work item scheduling run_one_async_start()/run_one_async_done(). */
	struct btrfs_work work;
};
620
621 /*
622 * In order to insert checksums into the metadata in large chunks, we wait
623 * until bio submission time. All the pages in the bio are checksummed and
624 * sums are attached onto the ordered extent record.
625 *
626 * At IO completion time the csums attached on the ordered extent record are
627 * inserted into the btree.
628 */
run_one_async_start(struct btrfs_work * work)629 static void run_one_async_start(struct btrfs_work *work)
630 {
631 struct async_submit_bio *async =
632 container_of(work, struct async_submit_bio, work);
633 int ret;
634
635 ret = btrfs_bio_csum(async->bbio);
636 if (ret)
637 async->bbio->bio.bi_status = errno_to_blk_status(ret);
638 }
639
640 /*
641 * In order to insert checksums into the metadata in large chunks, we wait
642 * until bio submission time. All the pages in the bio are checksummed and
643 * sums are attached onto the ordered extent record.
644 *
645 * At IO completion time the csums attached on the ordered extent record are
646 * inserted into the tree.
647 *
648 * If called with @do_free == true, then it will free the work struct.
649 */
run_one_async_done(struct btrfs_work * work,bool do_free)650 static void run_one_async_done(struct btrfs_work *work, bool do_free)
651 {
652 struct async_submit_bio *async =
653 container_of(work, struct async_submit_bio, work);
654 struct bio *bio = &async->bbio->bio;
655
656 if (do_free) {
657 kfree(container_of(work, struct async_submit_bio, work));
658 return;
659 }
660
661 /* If an error occurred we just want to clean up the bio and move on. */
662 if (bio->bi_status) {
663 btrfs_bio_end_io(async->bbio, bio->bi_status);
664 return;
665 }
666
667 /*
668 * All of the bios that pass through here are from async helpers.
669 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
670 * context. This changes nothing when cgroups aren't in use.
671 */
672 bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
673 btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
674 }
675
/* Decide whether a write should offload csum + submission to a workqueue. */
static bool should_async_write(struct btrfs_bio *bbio)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	/* NOTE(review): always true here — looks like a vestigial knob; confirm. */
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/*
	 * Write bios will calculate checksum and submit bio at the same time.
	 * Unless explicitly required don't offload serial csum calculate and bio
	 * submit into a workqueue.
	 */
	return false;
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info))
		return false;

	return true;
}
707
/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.  On
 * false the caller falls back to synchronous checksumming and submission.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct async_submit_bio *async;

	async = kmalloc_obj(*async, GFP_NOFS);
	if (!async)
		return false;

	/* Snapshot the mapping results; @smap lives on the caller's stack. */
	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}
733
/*
 * Limit @map_length for a zone append write to what the device can accept in
 * a single append command, keeping the result sectorsize aligned.
 */
static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	unsigned int nr_segs;
	int sector_offset;

	map_length = min(map_length, fs_info->max_zone_append_size);
	/* Check whether the block layer would split the bio even earlier. */
	sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits,
					&nr_segs, map_length);
	if (sector_offset) {
		/*
		 * bio_split_rw_at() could split at a size smaller than our
		 * sectorsize and thus cause unaligned I/Os. Fix that by
		 * always rounding down to the nearest boundary.
		 */
		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize);
	}
	return map_length;
}
753
/*
 * Map and submit the front part of @bbio that fits into one mapping.
 *
 * If the bio spans multiple stripes it is split, the front part is submitted
 * and false is returned so the caller re-invokes us for the remainder.
 * Returns true once the whole (remaining) bio is consumed, including the
 * error paths where it has been terminated.
 */
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t status;
	int ret;

	/* Scrub and data relocation search the raid stripe tree commit root. */
	if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root))
		smap.rst_search_commit_root = true;
	else
		smap.rst_search_commit_root = false;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
			      &bioc, &smap, &mirror_num);
	if (ret) {
		status = errno_to_blk_status(ret);
		btrfs_bio_counter_dec(fs_info);
		goto end_bbio;
	}

	/*
	 * For fscrypt writes we will get the encrypted bio after we've remapped
	 * our bio to the physical disk location, so we need to save the
	 * original bytenr so we know what we're checksumming.
	 */
	if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio))
		bbio->orig_logical = logical;

	bbio->can_use_append = btrfs_use_zone_append(bbio);

	map_length = min(map_length, length);
	if (bbio->can_use_append)
		map_length = btrfs_append_map_length(bbio, map_length);

	/* Split off the front part that fits into this mapping. */
	if (map_length < length) {
		struct btrfs_bio *split;

		split = btrfs_split_bio(fs_info, bbio, map_length);
		if (IS_ERR(split)) {
			status = errno_to_blk_status(PTR_ERR(split));
			btrfs_bio_counter_dec(fs_info);
			goto end_bbio;
		}
		/* Submit the split part now; the rest goes to the next call. */
		bbio = split;
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		status = errno_to_blk_status(ret);
		if (status)
			goto fail;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) {
			/* Offload the checksumming to a worker when possible. */
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		} else if (bbio->can_use_append ||
			  (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) {
			ret = btrfs_alloc_dummy_sum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		}
	}

	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail:
	btrfs_bio_counter_dec(fs_info);
	/*
	 * We have split the original bbio, now we have to end both the current
	 * @bbio and remaining one, as the remaining one will never be submitted.
	 */
	if (map_length < length) {
		struct btrfs_bio *remaining = bbio->private;

		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
		ASSERT(remaining);

		btrfs_bio_end_io(remaining, status);
	}
end_bbio:
	btrfs_bio_end_io(bbio, status);
	/* Do not submit another chunk */
	return true;
}
879
/*
 * Debug-only sanity checks: the bio's logical start and length must be
 * block aligned and non-zero, and every bvec must be aligned to the block
 * size (capped at page size).  Compiles away without CONFIG_BTRFS_ASSERT.
 */
static void assert_bbio_alignment(struct btrfs_bio *bbio)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	const u32 blocksize = fs_info->sectorsize;
	/* For block size > page size, bvecs only need page alignment. */
	const u32 alignment = min(blocksize, PAGE_SIZE);
	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	const u32 length = bbio->bio.bi_iter.bi_size;

	/* The logical and length should still be aligned to blocksize. */
	ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) &&
	       length != 0, "root=%llu inode=%llu logical=%llu length=%u",
	       btrfs_root_id(bbio->inode->root),
	       btrfs_ino(bbio->inode), logical, length);

	bio_for_each_bvec(bvec, &bbio->bio, iter)
		ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) &&
		       IS_ALIGNED(bvec.bv_len, alignment),
		       "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
		       btrfs_root_id(bbio->inode->root),
		       btrfs_ino(bbio->inode), logical, length, iter.bi_idx,
		       bvec.bv_offset, bvec.bv_len);
#endif
}
906
btrfs_submit_bbio(struct btrfs_bio * bbio,int mirror_num)907 void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
908 {
909 /* If bbio->inode is not populated, its file_offset must be 0. */
910 ASSERT(bbio->inode || bbio->file_offset == 0);
911
912 assert_bbio_alignment(bbio);
913
914 while (!btrfs_submit_chunk(bbio, mirror_num))
915 ;
916 }
917
918 /*
919 * Submit a repair write.
920 *
921 * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
922 * RAID setup. Here we only want to write the one bad copy, so we do the
923 * mapping ourselves and submit the bio directly.
924 *
925 * The I/O is issued synchronously to block the repair read completion from
926 * freeing the bio.
927 *
928 * @ino: Offending inode number
929 * @fileoff: File offset inside the inode
930 * @length: Length of the repair write
931 * @logical: Logical address of the range
932 * @paddrs: Physical address array of the content
933 * @step: Length of for each paddrs
934 * @mirror_num: Mirror number to write to. Must not be zero
935 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
			    u32 length, u64 logical, const phys_addr_t paddrs[],
			    unsigned int step, int mirror_num)
{
	/* Number of @step sized pieces needed to cover @length (step is pow2). */
	const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
	struct btrfs_io_stripe smap = { 0 };
	struct bio *bio = NULL;
	int ret = 0;

	/* mirror_num == 0 means "any mirror", which makes no sense for repair. */
	BUG_ON(!mirror_num);

	/* Basic alignment checks. */
	ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
	/* Either it's a single data or metadata block. */
	ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
	ASSERT(step <= length);
	ASSERT(is_power_of_2(step));

	/*
	 * The fs either mounted RO or hit critical errors, no need
	 * to continue repairing.
	 */
	if (unlikely(sb_rdonly(fs_info->sb)))
		return 0;

	/* Zoned devices cannot overwrite in place; relocation handles repair. */
	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	/* The target copy must be on a live, writeable device. */
	if (unlikely(!smap.dev->bdev ||
		     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
		ret = -EIO;
		goto out_counter_dec;
	}

	/* REQ_SYNC: issue immediately, completion is waited on below. */
	bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
	bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	for (int i = 0; i < nr_steps; i++) {
		ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
		/* We should have allocated enough slots to contain all the different pages. */
		ASSERT(ret == step);
	}
	ret = submit_bio_wait(bio);
	bio_put(bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_counter_dec;
	}

	btrfs_info_rl(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
			  ino, fileoff, btrfs_dev_name(smap.dev),
			  smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_counter_dec:
	/* Drop the dev-replace blocking reference taken above. */
	btrfs_bio_counter_dec(fs_info);
	return ret;
}
1007
1008 /*
1009 * Submit a btrfs_bio based repair write.
1010 *
1011 * If @dev_replace is true, the write would be submitted to dev-replace target.
1012 */
/*
 * Submit a metadata/scrub repair write to a single mirror.
 *
 * The bbio is mapped to exactly one stripe here instead of going through
 * btrfs_submit_bbio(), which would fan the write out to all copies.  With
 * @dev_replace set, the write is redirected from the replace source device
 * to its target.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
	const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	/* Only writes to one specific mirror make sense here. */
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!is_data_inode(bbio->inode));
	ASSERT(bbio->is_scrub);

	/* Block dev-replace from removing the stripe's device under us. */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
		return;
	}

	if (dev_replace) {
		/* Redirect the write from the replace source to its target. */
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
}
1042
btrfs_bioset_init(void)1043 int __init btrfs_bioset_init(void)
1044 {
1045 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
1046 offsetof(struct btrfs_bio, bio),
1047 BIOSET_NEED_BVECS))
1048 return -ENOMEM;
1049 if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
1050 offsetof(struct btrfs_bio, bio), 0))
1051 goto out;
1052 if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
1053 offsetof(struct btrfs_bio, bio),
1054 BIOSET_NEED_BVECS))
1055 goto out;
1056 if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
1057 sizeof(struct btrfs_failed_bio)))
1058 goto out;
1059 return 0;
1060
1061 out:
1062 btrfs_bioset_exit();
1063 return -ENOMEM;
1064 }
1065
/*
 * Tear down everything created by btrfs_bioset_init(), in reverse order of
 * initialization.  Safe on partially initialized state, so it doubles as the
 * error-path cleanup for btrfs_bioset_init().
 */
void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}
1073