// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "reflink.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifdef CONFIG_BCACHEFS_DEBUG
static unsigned bch2_read_corrupt_ratio;
module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif

static bool bch2_poison_extents_on_checksum_error;
module_param_named(poison_extents_on_checksum_error,
		   bch2_poison_extents_on_checksum_error, bool, 0644);
MODULE_PARM_DESC(poison_extents_on_checksum_error,
		 "Extents with checksum errors are marked as poisoned - unsafe without read fua support");

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	guard(rcu)();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}

	return get_random_u32_below(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

static const struct rhashtable_params bch_promote_params = {
	.head_offset		= offsetof(struct promote_op, hash),
	.key_offset		= offsetof(struct promote_op, pos),
	.key_len		= sizeof(struct bpos),
	.automatic_shrinking	= true,
};

static inline bool have_io_error(struct bch_io_failures *failed)
{
	return failed && failed->nr;
}

static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
{
	EBUG_ON(rbio->split);

	return rbio->data_update
		? container_of(rbio, struct data_update, rbio)
		: NULL;
}

static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
{
	struct data_update *u = rbio_data_update(orig);
	if (!u)
		return false;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == dev &&
		    u->data_opts.rewrite_ptrs & BIT(i))
			return true;
		i++;
	}

	return false;
}

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags,
				 struct bch_io_failures *failed)
{
	if (!have_io_error(failed)) {
		BUG_ON(!opts.promote_target);

		if (!(flags & BCH_READ_may_promote))
			return bch_err_throw(c, nopromote_may_not);

		if (bch2_bkey_has_target(c, k, opts.promote_target))
			return bch_err_throw(c, nopromote_already_promoted);

		if (bkey_extent_is_unwritten(k))
			return bch_err_throw(c, nopromote_unwritten);

		if (bch2_target_congested(c, opts.promote_target))
			return bch_err_throw(c, nopromote_congested);
	}

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return bch_err_throw(c, nopromote_in_flight);

	return 0;
}

static noinline void promote_free(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
	struct bch_fs *c = rbio->c;

	int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
					 bch_promote_params);
	BUG_ON(ret);

	async_object_list_del(c, promote, op->list_idx);

	bch2_data_update_exit(&op->write);

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op = container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.rbio.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
	promote_free(&op->write.rbio);
}

static void promote_start_work(struct work_struct *work)
{
	struct promote_op *op = container_of(work, struct promote_op, work);

	bch2_data_update_read_done(&op->write);
}

static noinline void promote_start(struct bch_read_bio *rbio)
{
	struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);

	trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);

	INIT_WORK(&op->work, promote_start_work);
	queue_work(rbio->c->write_ref_wq, &op->work);
}

static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bkey_s_c k,
					    struct bpos pos,
					    struct extent_ptr_decoded *pick,
					    unsigned sectors,
					    struct bch_read_bio *orig,
					    struct bch_io_failures *failed)
{
	struct bch_fs *c = trans->c;
	int ret;

	struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };

	if (!have_io_error(failed)) {
		update_opts.target = orig->opts.promote_target;
		update_opts.extra_replicas = 1;
		update_opts.write_flags |= BCH_WRITE_cached;
		update_opts.write_flags |= BCH_WRITE_only_specified_devs;
	} else {
		update_opts.target = orig->opts.foreground_target;

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		unsigned ptr_bit = 1;
		bkey_for_each_ptr(ptrs, ptr) {
			if (bch2_dev_io_failures(failed, ptr->dev) &&
			    !ptr_being_rewritten(orig, ptr->dev))
				update_opts.rewrite_ptrs |= ptr_bit;
			ptr_bit <<= 1;
		}

		if (!update_opts.rewrite_ptrs)
			return NULL;
	}

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op) {
		ret = bch_err_throw(c, nopromote_enomem);
		goto err_put;
	}

	op->start_time = local_clock();
	op->pos = pos;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = bch_err_throw(c, nopromote_in_flight);
		goto err;
	}

	ret = async_object_list_add(c, promote, op, &op->list_idx);
	if (ret < 0)
		goto err_remove_hash;

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			&orig->opts,
			update_opts,
			btree_id, k);
	op->write.type = BCH_DATA_UPDATE_promote;
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret)
		goto err_remove_list;

	rbio_init_fragment(&op->write.rbio.bio, orig);
	op->write.rbio.bounce	= true;
	op->write.rbio.promote	= true;
	op->write.op.end_io = promote_done;

	return &op->write.rbio;
err_remove_list:
	async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
	BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
				      bch_promote_params));
err:
	bio_free_pages(&op->write.op.wbio.bio);
	/* We may have added to the rhashtable and thus need rcu freeing: */
	kfree_rcu(op, rcu);
err_put:
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
					  struct bvec_iter iter,
					  struct bkey_s_c k,
					  struct extent_ptr_decoded *pick,
					  unsigned flags,
					  struct bch_read_bio *orig,
					  bool *bounce,
					  bool *read_full,
					  struct bch_io_failures *failed)
{
	/*
	 * We're in the retry path, but we don't know what to repair yet, and we
	 * don't want to do a promote here:
	 */
	if (failed && !failed->nr)
		return NULL;

	struct bch_fs *c = trans->c;
	/*
	 * if failed != NULL we're not actually doing a promote, we're
	 * recovering from an io/checksum error
	 */
	bool promote_full = (have_io_error(failed) ||
			     *read_full ||
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	int ret;

	ret = should_promote(c, k, pos, orig->opts, flags, failed);
	if (ret)
		goto nopromote;

	struct bch_read_bio *promote =
		__promote_alloc(trans,
				k.k->type == KEY_TYPE_reflink_v
				? BTREE_ID_reflink
				: BTREE_ID_extents,
				k, pos, pick, sectors, orig, failed);
	if (!promote)
		return NULL;

	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;

	if (have_io_error(failed))
		orig->self_healing = true;

	return promote;
nopromote:
	trace_io_read_nopromote(c, ret);
	return NULL;
}

void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
{
	if (!op->write.read_done) {
		prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
		printbuf_indent_add(out, 2);
		bch2_read_bio_to_text(out, op->write.rbio.parent);
		printbuf_indent_sub(out, 2);
	}

	bch2_data_update_to_text(out, &op->write);
}

/* Read */

static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   struct bch_read_bio *rbio, struct bpos read_pos)
{
	int ret = lockrestart_do(trans,
		bch2_inum_offset_err_msg_trans(trans, out,
				(subvol_inum) { rbio->subvol, read_pos.inode },
				read_pos.offset << 9));
	if (ret)
		return ret;

	if (rbio->data_update)
		prt_str(out, "(internal move) ");

	return 0;
}

static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
			      struct bch_read_bio *rbio, struct bpos read_pos)
{
	bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->have_ioref) {
		struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
	}

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (unlikely(rbio->promote)) {
			if (!rbio->bio.bi_status)
				promote_start(rbio);
			else
				promote_free(rbio);
		} else {
			async_object_list_del(rbio->c, rbio, rbio->list_idx);

			if (rbio->bounce)
				bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

			bio_put(&rbio->bio);
		}

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

static void get_rbio_extent(struct btree_trans *trans,
			    struct bch_read_bio *rbio,
			    struct bkey_buf *sk)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
			bkey_err(k = bch2_bkey_get_iter(trans, &iter,
						rbio->data_btree, rbio->data_pos, 0)));
	if (ret)
		return;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
			bch2_bkey_buf_reassemble(sk, trans->c, k);
			break;
		}

	bch2_trans_iter_exit(trans, &iter);
}

static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
					enum btree_id btree, struct bkey_s_c read_k)
{
	if (!bch2_poison_extents_on_checksum_error)
		return 0;

	struct bch_fs *c = trans->c;

	struct data_update *u = rbio_data_update(rbio);
	if (u)
		read_k = bkey_i_to_s_c(u->k.k);

	u64 flags = bch2_bkey_extent_flags(read_k);
	if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (!bkey_and_val_eq(k, read_k))
		goto out;

	struct bkey_i *new = bch2_trans_kmalloc(trans,
					bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
	ret =   PTR_ERR_OR_ZERO(new) ?:
		(bkey_reassemble(new, k), 0) ?:
		bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
		bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);

	/*
	 * Propagate key change back to data update path, in particular so it
	 * knows the extent has been poisoned and it's safe to change the
	 * checksum
	 */
	if (u && !ret)
		bch2_bkey_buf_copy(&u->k, c, new);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
					     struct bch_read_bio *rbio,
					     struct bvec_iter bvec_iter,
					     struct bch_io_failures *failed,
					     unsigned flags)
{
	struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
	bch2_trans_begin(trans);

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = lockrestart_do(trans,
		bkey_err(k = bch2_bkey_get_iter(trans, &iter,
				u->btree_id, bkey_start_pos(&u->k.k->k),
				0)));
	if (ret)
		goto err;

	if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
		/* extent we wanted to read no longer exists: */
		rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
		goto err;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 bkey_start_pos(&u->k.k->k),
				 u->btree_id,
				 bkey_i_to_s_c(u->k.k),
				 0, failed, flags, -1);
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    bch2_err_matches(ret, BCH_ERR_data_read_retry))
		goto retry;

	if (ret) {
		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;
	}

	BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
	return ret;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	struct btree_trans *trans = bch2_trans_get(c);

	struct bkey_buf sk;
	bch2_bkey_buf_init(&sk);
	bkey_init(&sk.k->k);

	trace_io_read_retry(&rbio->bio);
	this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
		     bvec_iter_sectors(rbio->bvec_iter));

	get_rbio_extent(trans, rbio, &sk);

	if (!bkey_deleted(&sk.k->k) &&
	    bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
		bch2_mark_io_failure(&failed, &rbio->pick,
				     rbio->ret == -BCH_ERR_data_read_retry_csum_err);

	if (!rbio->split) {
		rbio->bio.bi_status	= 0;
		rbio->ret		= 0;
	}

	unsigned subvol		= rbio->subvol;
	struct bpos read_pos	= rbio->read_pos;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_in_retry;
	flags &= ~BCH_READ_may_promote;
	flags &= ~BCH_READ_last_fragment;
	flags |= BCH_READ_must_clone;

	int ret = rbio->data_update
		? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
		: __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);

	if (ret) {
		rbio->ret = ret;
		rbio->bio.bi_status = BLK_STS_IOERR;
	}

	if (failed.nr || ret) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);

		lockrestart_do(trans,
			bch2_inum_offset_err_msg_trans(trans, &buf,
					(subvol_inum) { subvol, read_pos.inode },
					read_pos.offset << 9));
		if (rbio->data_update)
			prt_str(&buf, "(internal move) ");

		prt_str(&buf, "data read error, ");
		if (!ret) {
			prt_str(&buf, "successful retry");
			if (rbio->self_healing)
				prt_str(&buf, ", self healing");
		} else
			prt_str(&buf, bch2_err_str(ret));
		prt_newline(&buf);


		if (!bkey_deleted(&sk.k->k)) {
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
			prt_newline(&buf);
		}

		bch2_io_failures_to_text(&buf, c, &failed);

		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}

	bch2_rbio_done(rbio);
	bch2_bkey_buf_exit(&sk, c);
	bch2_trans_put(trans);
}

static void bch2_rbio_error(struct bch_read_bio *rbio,
			    int ret, blk_status_t blk_error)
{
	BUG_ON(ret >= 0);

	rbio->ret		= ret;
	rbio->bio.bi_status	= blk_error;

	bch2_rbio_parent(rbio)->saw_error = true;

	if (rbio->flags & BCH_READ_in_retry)
		return;

	if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	} else {
		rbio = bch2_rbio_free(rbio);

		rbio->ret		= ret;
		rbio->bio.bi_status	= blk_error;

		bch2_rbio_done(rbio);
	}
}

static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_slots|BTREE_ITER_intent);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->bversion, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
			rbio->pick.crc, NULL, &new_crc,
			bkey_start_offset(k.k) - data_offset, k.k->size,
			rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			     __bch2_rbio_narrow_crcs(trans, rbio));
}

static void bch2_read_decompress_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decompression error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

static void bch2_read_decrypt_err(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct printbuf buf = PRINTBUF;

	bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
	prt_str(&buf, "decrypt error");

	struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	if (ca)
		bch_err_ratelimited(ca, "%s", buf.buf);
	else
		bch_err_ratelimited(c, "%s", buf.buf);

	bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
	printbuf_exit(&buf);
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct bch_read_bio *parent	= bch2_rbio_parent(rbio);
	struct bio *src			= &rbio->bio;
	struct bio *dst			= &parent->bio;
	struct bvec_iter dst_iter	= rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;

	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
		rbio->flags |= BCH_READ_must_bounce;
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
				BLK_STS_IOERR);
		goto out;
	}

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

	if (!csum_good)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (likely(!parent->data_update)) {
		/* Adjust crc to point to subset of data we want: */
		crc.offset     += rbio->offset_into_extent;
		crc.live_size	= bvec_iter_sectors(rbio->bvec_iter);

		if (crc_is_compressed(crc)) {
			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
			    !c->opts.no_data_io)
				goto decompression_err;
		} else {
			/* don't need to decrypt the entire bio: */
			nonce = nonce_add(nonce, crc.offset << 9);
			bio_advance(src, crc.offset << 9);

			BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
			src->bi_iter.bi_size = dst_iter.bi_size;

			ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
			if (ret)
				goto decrypt_err;

			if (rbio->bounce) {
				struct bvec_iter src_iter = src->bi_iter;

				bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
			}
		}
	} else {
		if (rbio->split)
			rbio->parent->pick = rbio->pick;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;
	}

	if (likely(!(rbio->flags & BCH_READ_in_retry))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
decrypt_err:
	bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	goto out;
}

static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
				   rbio->submit_time, !bio->bi_status);

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (unlikely(bio->bi_status)) {
		bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
	    (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
		trace_and_count(c, io_read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_retry_if_stale)
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND,	wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI,	wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bch_dev *ca,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(ca, &ptr),
			     BTREE_ITER_cached);

	int gen = bucket_gen_get(ca, iter.pos.offset);
	if (gen >= 0) {
		prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		prt_printf(&buf, "memory gen: %u", gen);

		ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
		if (!ret) {
			prt_newline(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
		}
	} else {
		prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
			   iter.pos.inode, iter.pos.offset);
		printbuf_indent_add(&buf, 2);

		prt_printf(&buf, "first bucket %u nbuckets %llu\n",
			   ca->mi.first_bucket, ca->mi.nbuckets);

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags, int dev)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	struct data_update *u = rbio_data_update(orig);
	int ret = 0;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
			     bvec_iter_sectors(iter));
		goto out_read_done;
	}

	if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
	    !orig->data_update)
		return bch_err_throw(c, extent_poisoned);
retry_pick:
	ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);

	/* hole or reservation - just zero fill: */
	if (!ret)
		goto hole;

	if (unlikely(ret < 0)) {
		if (ret == -BCH_ERR_data_read_csum_err) {
			int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
			if (ret2) {
				ret = ret2;
				goto err;
			}

			trace_and_count(c, io_read_fail_and_poison, &orig->bio);
		}

		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "%s\n ", bch2_err_str(ret));
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
	    !c->chacha20_key_set) {
		struct printbuf buf = PRINTBUF;
		bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
		prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
		bch2_bkey_val_to_text(&buf, c, k);

		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);
		ret = bch_err_throw(c, data_read_no_encryption_key);
		goto err;
	}

	struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
					BCH_DEV_READ_REF_io_read);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_in_retry) &&
	    !pick.ptr.cached &&
	    ca &&
	    unlikely(dev_ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick, false);
		enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
		goto retry_pick;
	}

	if (likely(!u)) {
		if (!(flags & BCH_READ_last_fragment) ||
		    bio_flagged(&orig->bio, BIO_CHAIN))
			flags |= BCH_READ_must_clone;

		narrow_crcs = !(flags & BCH_READ_in_retry) &&
			bch2_can_narrow_extent_crcs(k, pick.crc);

		if (narrow_crcs && (flags & BCH_READ_user_mapped))
			flags |= BCH_READ_must_bounce;

		EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

		if (crc_is_compressed(pick.crc) ||
		    (pick.crc.csum_type != BCH_CSUM_none &&
		     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
		      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
		       (flags & BCH_READ_user_mapped)) ||
		      (flags & BCH_READ_must_bounce)))) {
			read_full = true;
			bounce = true;
		}
	} else {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
			if (ca)
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_io_read);
			rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
			goto out_read_done;
		}

		iter.bi_size	= pick.crc.compressed_size << 9;
		read_full	= true;
	}

	if (orig->opts.promote_target || have_io_error(failed))
		rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
				     &bounce, &read_full, failed);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}

	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
						DIV_ROUND_UP(sectors, PAGE_SECTORS),
						0,
						GFP_NOFS,
						&c->bio_read_split),
				 orig);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
	} else if (flags & BCH_READ_must_clone) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig);
		rbio->bio.bi_iter = iter;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->submit_time	= local_clock();
	if (!rbio->split)
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent= offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= ca != NULL;
	rbio->narrow_crcs	= narrow_crcs;
	rbio->ret		= 0;
	rbio->context		= 0;
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->bversion;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	async_object_list_add(c, rbio, rbio, &rbio->list_idx);

	if (rbio->bounce)
		trace_and_count(c, io_read_bounce, &rbio->bio);

	if (!u)
		this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	else
		this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (ca && pick.ptr.cached && !u)
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, io_read_split, &orig->bio);
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	if (!(flags & BCH_READ_in_retry))
		bch2_trans_unlock(trans);
	else
		bch2_trans_unlock_long(trans);

	if (likely(!rbio->pick.do_ec_reconstruct)) {
		if (unlikely(!rbio->have_ioref)) {
			bch2_rbio_error(rbio,
					-BCH_ERR_data_read_retry_device_offline,
					BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_in_retry)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_in_retry)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio, k)) {
			bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
					BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_in_retry)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_in_retry))) {
		return 0;
	} else {
		bch2_trans_unlock(trans);

		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->ret;
		rbio = bch2_rbio_free(rbio);

		if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
			bch2_mark_io_failure(failed, &pick,
					ret == -BCH_ERR_data_read_retry_csum_err);

		return ret;
	}

err:
	if (flags & BCH_READ_in_retry)
		return ret;

	orig->bio.bi_status	= BLK_STS_IOERR;
	orig->ret		= ret;
	goto out_read_done;

hole:
	this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
		     bvec_iter_sectors(iter));
	/*
	 * won't normally happen in the data update (bch2_move_extent()) path,
	 * but if we retry and the extent we wanted to read no longer exists we
	 * have to signal that:
	 */
	if (u)
		orig->ret = bch_err_throw(c, data_read_key_overwritten);

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if ((flags & BCH_READ_last_fragment) &&
	    !(flags & BCH_READ_in_retry))
		bch2_rbio_done(orig);
	return 0;
}

int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
		struct bvec_iter bvec_iter, subvol_inum inum,
		struct bch_io_failures *failed,
		struct bkey_buf *prev_read,
		unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	enum btree_id data_btree;
	int ret;

	EBUG_ON(rbio->data_update);

	bch2_bkey_buf_init(&sk);
	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(inum.inum, bvec_iter.bi_sector),
			     BTREE_ITER_slots);

	while (1) {
		data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);

		bch2_btree_iter_set_pos(trans, &iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		s64 offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		unsigned sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			goto err;

		k = bkey_i_to_s_c(sk.k);

		if (unlikely(flags & BCH_READ_in_retry)) {
			if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
				failed->nr = 0;
			bch2_bkey_buf_copy(prev_read, c, sk.k);
		}

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_last_fragment;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags, -1);
		swap(bvec_iter.bi_size, bytes);

		if (ret)
			goto err;

		if (flags & BCH_READ_last_fragment)
			break;

		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
		if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
			flags |= BCH_READ_must_bounce;

		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_data_read_retry))
			break;
	}

	if (unlikely(ret)) {
		if (ret != -BCH_ERR_extent_poisoned) {
			struct printbuf buf = PRINTBUF;
			lockrestart_do(trans,
				       bch2_inum_offset_err_msg_trans(trans, &buf, inum,
								      bvec_iter.bi_sector << 9));
			prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
			bch_err_ratelimited(c, "%s", buf.buf);
			printbuf_exit(&buf);
		}

		rbio->bio.bi_status	= BLK_STS_IOERR;
		rbio->ret		= ret;

		if (!(flags & BCH_READ_in_retry))
			bch2_rbio_done(rbio);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

static const char * const bch2_read_bio_flags[] = {
#define x(n)	#n,
	BCH_READ_FLAGS()
#undef x
	NULL
};

void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
{
	u64 now = local_clock();
	prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
	prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);

	if (!rbio->split)
		prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
	else
		prt_printf(out, "parent:\t%px\n", rbio->parent);

	prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);

	prt_printf(out, "promote:\t%u\n", rbio->promote);
	prt_printf(out, "bounce:\t%u\n", rbio->bounce);
	prt_printf(out, "split:\t%u\n", rbio->split);
	prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
	prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
	prt_printf(out, "context:\t%u\n", rbio->context);

	int ret = READ_ONCE(rbio->ret);
	if (ret < 0)
		prt_printf(out, "ret:\t%s\n", bch2_err_str(ret));
	else
		prt_printf(out, "ret:\t%i\n", ret);

	prt_printf(out, "flags:\t");
	bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
	prt_newline(out);

	bch2_bio_to_text(out, &rbio->bio);
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
	mempool_exit(&c->bio_bounce_pages);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (mempool_init_page_pool(&c->bio_bounce_pages,
				   max_t(unsigned,
					 c->opts.btree_node_size,
					 c->opts.encoded_extent_max) /
				   PAGE_SIZE, 0))
		return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);

	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return bch_err_throw(c, ENOMEM_bio_read_init);

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return bch_err_throw(c, ENOMEM_bio_read_split_init);

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return bch_err_throw(c, ENOMEM_promote_table_init);

	return 0;
}