// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

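/*
 * Check whether the devices in the given target are currently congested, so
 * that cache promotion can be skipped while they are busy. The per-device
 * congestion count decays over time; comparing the total against a random
 * value makes the throttling probabilistic rather than a hard cutoff.
 */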
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

struct promote_op {
	struct rcu_head rcu;
	u64 start_time;

	struct rhash_head hash;
	struct bpos pos;

	struct work_struct work;
	struct data_update write;
	struct bio_vec bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

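/*
 * Decide whether an extent should be promoted - or, when called from the read
 * retry path with a non-empty @failed list, rewritten for error recovery.
 * Returns 0 if the promote should be attempted, or a -BCH_ERR_nopromote_*
 * reason otherwise.
 */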
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	bch2_data_update_exit(&op->write);

	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

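/*
 * Allocate and initialize a promote operation: take a write ref, add the
 * operation to the in-flight promote table, and set up the data update that
 * will write the cached copy (or, in the error recovery case, rewrite the
 * pointers that failed). Returns the rbio to read into, NULL if there is
 * nothing to do, or an ERR_PTR() on failure.
 */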
static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_hash;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce = true;
	op->write.rbio.promote = true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

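/*
 * Top of the promote path: decide whether to promote this extent (or rewrite
 * failed pointers when recovering from an IO/checksum error), and if so
 * allocate the promote rbio. Also tells the caller whether the read must be
 * bounced and whether the full extent has to be read.
 */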
noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce = true;
	*read_full = promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func = fn;
		rbio->context = context;
		queue_work(wq, &rbio->work);
	}
}

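/*
 * Tear down a split rbio once its IO has completed: drop the device ref,
 * kick off or free any pending promote, release bounce pages, and return the
 * parent rbio so the caller can continue completion there.
 */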
static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		percpu_ref_put(&ca->io_ref[READ]);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status = BLK_STS_IOERR;
		rbio->ret = ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

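/*
 * Retry a failed read from workqueue context: record the failed device when
 * the error warrants avoiding it, reset the rbio state, and re-issue the read
 * with BCH_READ_in_retry set. A successful retry is logged unless the
 * original failure was one we expect to be transient.
 */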
static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bvec_iter iter = rbio->bvec_iter;
	unsigned flags = rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };
	int orig_error = rbio->ret;

	struct btree_trans *trans = bch2_trans_get(c);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status = 0;
		rbio->ret = 0;
	}

	unsigned subvol = rbio->subvol;
	struct bpos read_pos = rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	} else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
		   orig_error != -BCH_ERR_data_read_ptr_stale_race &&
		   !failed.nr) {
		struct printbuf buf = PRINTBUF;

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");
		prt_str(&buf, "successful retry");

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret = ret;
	rbio->bio.bi_status = blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret = ret;
		rbio->bio.bi_status = blk_error;

		bch2_rbio_done(rbio);
	}
}

static void bch2_read_io_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bio *bio = &rbio->bio;
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));

	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	printbuf_exit(&buf);
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}

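/*
 * Checksum narrowing: after reading and verifying a checksum that covers more
 * data than the extent currently references (e.g. after the extent was
 * partially overwritten), recompute a checksum over just the live range and
 * update the extent key, so future reads don't have to read the extra data.
 */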
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_csum_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bio *src = &rbio->bio;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent = bch2_rbio_parent(rbio);
	struct bio *src = &rbio->bio;
	struct bio *dst = &parent->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size = crc.compressed_size << 9;
		src->bi_iter.bi_idx = 0;
		src->bi_iter.bi_bvec_done = 0;
	} else {
		src->bi_iter = rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset += rbio->offset_into_extent;
		crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

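/*
 * Raw bio completion handler: account the IO, catch device errors and stale
 * pointer races, then punt the rest of completion (checksum verification,
 * decryption, decompression) to the appropriate context via bch2_rbio_punt().
 */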
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c = rbio->c;
	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

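/*
 * Read a single (possibly partial) extent: pick a device/pointer to read
 * from, decide whether the read needs to be bounced or widened to the whole
 * extent, optionally set up a promote, then submit the IO. In the retry path
 * (BCH_READ_in_retry) the read is performed synchronously and the result is
 * returned directly.
 */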
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		percpu_ref_put(&ca->io_ref[READ]);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				percpu_ref_put(&ca->io_ref[READ]);
			orig->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size = pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent = 0;
		pick.crc.compressed_size = bvec_iter_sectors(iter);
		pick.crc.uncompressed_size = bvec_iter_sectors(iter);
		pick.crc.offset = 0;
		pick.crc.live_size = bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
					  orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce = true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						&c->bio_read_split),
					  orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time = local_clock();
	if (!rbio->split)
		rbio->end_io = orig->bio.bi_end_io;
	rbio->bvec_iter = iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags = flags;
	rbio->have_ioref = ca != NULL;
	rbio->narrow_crcs = narrow_crcs;
	rbio->ret = 0;
	rbio->context = 0;
	rbio->pick = pick;
	rbio->subvol = orig->subvol;
	rbio->read_pos = read_pos;
	rbio->data_btree = data_btree;
	rbio->data_pos = data_pos;
	rbio->version = k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf = orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io = bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			struct printbuf buf = PRINTBUF;
			bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
			prt_printf(&buf, "no device to read from:\n ");
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);

			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status = BLK_STS_IOERR;
	orig->ret = ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

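/*
 * Main read path for a (possibly multi-extent) request: walk the extents
 * btree for the given subvolume/inode, resolve indirect extents, and issue
 * __bch2_read_extent() for each fragment until the request is satisfied.
 */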
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				bch2_inum_offset_err_msg_trans(trans, &buf, inum,
						bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status = BLK_STS_IOERR;
		rbio->ret = ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}