// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

static bool bch2_poison_extents_on_checksum_error;
module_param_named(poison_extents_on_checksum_error,
		   bch2_poison_extents_on_checksum_error, bool, 0644);
MODULE_PARM_DESC(poison_extents_on_checksum_error,
		 "Extents with checksum errors are marked as poisoned - unsafe without read fua support");

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return -BCH_ERR_nopromote_may_not;

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return -BCH_ERR_nopromote_already_promoted;

		if (bkey_extent_is_unwritten(k))
			return -BCH_ERR_nopromote_unwritten;

		if (bch2_target_congested(c, opts.promote_target))
			return -BCH_ERR_nopromote_congested;
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	async_object_list_del(c, promote, op->list_idx);

	bch2_data_update_exit(&op->write);

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	ret = async_object_list_add(c, promote, op, &op->list_idx);
	if (ret < 0)
		goto err_remove_hash;

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_list;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_list:
	async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	/*
	 * We're in the retry path, but we don't know what to repair yet, and we
	 * don't want to do a promote here:
	 */
	if (failed && !failed->nr)
		return NULL;

	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
{
	if (!op->write.read_done) {
		prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
		printbuf_indent_add(out, 2);
		bch2_read_bio_to_text(out, op->write.rbio.parent);
		printbuf_indent_sub(out, 2);
	}

	bch2_data_update_to_text(out, &op->write);
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			async_object_list_del(rbio->c, rbio, rbio->list_idx);

			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void get_rbio_extent(struct btree_trans *trans,
			    struct bch_read_bio *rbio,
			    struct bkey_buf *sk)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
			bkey_err(k = bch2_bkey_get_iter(trans, &iter,
						rbio->data_btree, rbio->data_pos, 0)));
	if (ret)
		return;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
			bch2_bkey_buf_reassemble(sk, trans->c, k);
			break;
		}

	bch2_trans_iter_exit(trans, &iter);
}

static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
					enum btree_id btree, struct bkey_s_c read_k)
{
	if (!bch2_poison_extents_on_checksum_error)
		return 0;

	struct bch_fs *c = trans->c;

	struct data_update *u = rbio_data_update(rbio);
	if (u)
		read_k = bkey_i_to_s_c(u->k.k);

	u64 flags = bch2_bkey_extent_flags(read_k);
	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bkey_and_val_eq(k, read_k))
		goto out;

	struct bkey_i *new = bch2_trans_kmalloc(trans,
					bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
	ret = PTR_ERR_OR_ZERO(new) ?:
		(bkey_reassemble(new, k), 0) ?:
		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
		bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);

	/*
	 * Propagate key change back to data update path, in particular so it
	 * knows the extent has been poisoned and it's safe to change the
	 * checksum
	 */
	if (u && !ret)
		bch2_bkey_buf_copy(&u->k, c, new);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
			bkey_err(k = bch2_bkey_get_iter(trans, &iter,
					u->btree_id, bkey_start_pos(&u->k.k->k),
					0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = -BCH_ERR_data_read_key_overwritten;
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	struct btree_trans *trans = bch2_trans_get(c);

	struct bkey_buf sk;
	bch2_bkey_buf_init(&sk);
	bkey_init(&sk.k->k);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	get_rbio_extent(trans, rbio, &sk);

	if (!bkey_deleted(&sk.k->k) &&
	    bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	}

	if (failed.nr || ret) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");

		prt_str(&buf, "data read error, ");
		if (!ret)
			prt_str(&buf, "successful retry");
		else
			prt_str(&buf, bch2_err_str(ret));
		prt_newline(&buf);

		if (!bkey_deleted(&sk.k->k)) {
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
			prt_newline(&buf);
		}

		bch2_io_failures_to_text(&buf, c, &failed);

		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_bkey_buf_exit(&sk, c);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset	+= rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}

	if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
	    !orig->data_update)
		return -BCH_ERR_extent_poisoned;
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		if (ret == -BCH_ERR_data_read_csum_err) {
			int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
			if (ret2) {
				ret = ret2;
				goto err;
			}

			trace_and_count(c, io_read_fail_and_poison, &orig->bio);
		}

		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = -BCH_ERR_data_read_no_encryption_key;
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
					BCH_DEV_READ_REF_io_read);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_io_read);
			rbio->ret = -BCH_ERR_data_read_buffer_too_small;
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full = true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
					  orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						&c->bio_read_split),
					  orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	async_object_list_add(c, rbio, rbio, &rbio->list_idx);

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = -BCH_ERR_data_read_key_overwritten;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed,
		struct bkey_buf *prev_read,
		unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	enum btree_id data_btree;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		if (unlikely(flags & BCH_READ_in_retry)) {
			if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
				failed->nr = 0;
			bch2_bkey_buf_copy(prev_read, c, sk.k);
		}

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				bch2_inum_offset_err_msg_trans(trans, &buf, inum,
						bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

static const char * const bch2_read_bio_flags[] = {
#define x(n)	#n,
	BCH_READ_FLAGS()
#undef x
	NULL
};

void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
{
	u64 now = local_clock();
	prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
	prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);

	if (!rbio->split)
		prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
	else
		prt_printf(out, "parent:\t%px\n", rbio->parent);

	prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);

	prt_printf(out, "promote:\t%u\n", rbio->promote);
	prt_printf(out, "bounce:\t%u\n", rbio->bounce);
	prt_printf(out, "split:\t%u\n", rbio->split);
	prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
	prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
	prt_printf(out, "context:\t%u\n", rbio->context);
	prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret));

	prt_printf(out, "flags:\t");
	bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
	prt_newline(out);

	bch2_bio_to_text(out, &rbio->bio);
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
	mempool_exit(&c->bio_bounce_pages);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;

	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}