// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "btree_gc.h"
#include "btree_node_scan.h"
#include "disk_accounting.h"
#include "ec.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "lru.h"
#include "logged_ops.h"
#include "movinggc.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"

const char * const bch2_recovery_passes[] = {
#define x(_fn, ...)	#_fn,
	BCH_RECOVERY_PASSES()
#undef x
	NULL
};

static const u8 passes_to_stable_map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
	BCH_RECOVERY_PASSES()
#undef x
};

static const u8 passes_from_stable_map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
	BCH_RECOVERY_PASSES()
#undef x
};

static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
{
	return passes_to_stable_map[pass];
}

u64 bch2_recovery_passes_to_stable(u64 v)
{
	u64 ret = 0;
	for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(passes_to_stable_map[i]);
	return ret;
}

static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
{
	return pass < ARRAY_SIZE(passes_from_stable_map)
		? passes_from_stable_map[pass]
		: 0;
}

u64 bch2_recovery_passes_from_stable(u64 v)
{
	u64 ret = 0;
	for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(passes_from_stable_map[i]);
	return ret;
}
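
/*
 * Illustrative sketch (not part of the original source): the stable pass
 * numbers are what get written to the superblock, and converting a pass mask
 * is a lossless round trip for every pass this version knows about:
 *
 *	u64 mem    = BIT_ULL(BCH_RECOVERY_PASS_check_snapshots);
 *	u64 stable = bch2_recovery_passes_to_stable(mem);
 *
 * then bch2_recovery_passes_from_stable(stable) == mem; a stable number this
 * version doesn't know about maps back to pass 0.
 */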

static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
					    enum bch_validate_flags flags, struct printbuf *err)
{
	return 0;
}

static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
					    struct bch_sb *sb,
					    struct bch_sb_field *f)
{
	struct bch_sb_field_recovery_passes *r =
		field_to_type(f, recovery_passes);
	unsigned nr = recovery_passes_nr_entries(r);

	if (out->nr_tabstops < 1)
		printbuf_tabstop_push(out, 32);
	if (out->nr_tabstops < 2)
		printbuf_tabstop_push(out, 16);

	prt_printf(out, "Pass\tLast run\tLast runtime\n");

	for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
		if (!i->last_run)
			continue;

		unsigned idx = i - r->start;

		prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);

		bch2_prt_datetime(out, le64_to_cpu(i->last_run));
		prt_tab(out);

		bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);

		if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
			prt_str(out, " (no ratelimit)");

		prt_newline(out);
	}
}

static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
							       enum bch_recovery_pass pass)
{
	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);

	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_recovery_passes *r =
		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);

	if (stable >= recovery_passes_nr_entries(r)) {
		unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);

		r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
		if (!r) {
			bch_err(c, "error creating recovery_passes sb section");
			return NULL;
		}
	}

	return r->start + stable;
}

static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
					   enum bch_recovery_pass pass,
					   s64 start_time)
{
	guard(mutex)(&c->sb_lock);
	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	__clear_bit_le64(bch2_recovery_pass_to_stable(pass),
			 ext->recovery_passes_required);

	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
	if (e) {
		s64 end_time = ktime_get_real_seconds();
		e->last_run	= cpu_to_le64(end_time);
		e->last_runtime	= cpu_to_le32(max(0, end_time - start_time));
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
	}

	bch2_write_super(c);
}

void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
					 enum bch_recovery_pass pass)
{
	guard(mutex)(&c->sb_lock);

	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
	if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
		bch2_write_super(c);
	}
}

static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
{
	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
	bool ret = false;

	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_recovery_passes *r =
		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);

	if (stable < recovery_passes_nr_entries(r)) {
		struct recovery_pass_entry *i = r->start + stable;

		/*
		 * Ratelimit if the last runtime was more than 1% of the time
		 * since we last ran
		 */
		ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
			ktime_get_real_seconds() - le64_to_cpu(i->last_run);

		if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
			ret = false;
	}

	return ret;
}

const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
	.validate	= bch2_sb_recovery_passes_validate,
	.to_text	= bch2_sb_recovery_passes_to_text
};
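
/*
 * Worked example of the ratelimit heuristic in
 * bch2_recovery_pass_want_ratelimit() above (illustrative sketch): a pass
 * whose last run took 30 seconds stays ratelimited until at least
 * 100 * 30 = 3000 seconds have elapsed since last_run - i.e. a scheduled
 * pass is allowed to consume roughly 1% of wall-clock time - unless
 * NO_RATELIMIT is set on its superblock entry, which always disables the
 * ratelimit.
 */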

/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
static int bch2_recovery_pass_empty(struct bch_fs *c)
{
	return 0;
}

static int bch2_set_may_go_rw(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	/*
	 * After we go RW, the journal keys buffer can't be modified (except for
	 * setting journal_key->overwritten): it will be accessed by multiple
	 * threads.
	 */
	move_gap(keys, keys->nr);

	set_bit(BCH_FS_may_go_rw, &c->flags);

	if (go_rw_in_recovery(c)) {
		if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
			bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
			bch2_reconstruct_alloc(c);
		}

		return bch2_fs_read_write_early(c);
	}
	return 0;
}

/*
 * Make sure the root inode is readable while we're still in recovery and can
 * rewind for repair:
 */
static int bch2_lookup_root_inode(struct bch_fs *c)
{
	subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;

	return bch2_trans_do(c,
		bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
}

struct recovery_pass_fn {
	int		(*fn)(struct bch_fs *);
	unsigned	when;
};

static struct recovery_pass_fn recovery_pass_fns[] = {
#define x(_fn, _id, _when)	{ .fn = bch2_##_fn, .when = _when },
	BCH_RECOVERY_PASSES()
#undef x
};

static u64 bch2_recovery_passes_match(unsigned flags)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
		if (recovery_pass_fns[i].when & flags)
			ret |= BIT_ULL(i);
	return ret;
}

u64 bch2_fsck_recovery_passes(void)
{
	return bch2_recovery_passes_match(PASS_FSCK);
}
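
/*
 * Illustrative expansion (hypothetical entry, for clarity only): a table
 * entry of the form
 *
 *	x(check_foo, <id>, PASS_FSCK)
 *
 * would expand above to
 *
 *	{ .fn = bch2_check_foo, .when = PASS_FSCK },
 *
 * so bch2_recovery_passes_match(PASS_FSCK) would include that pass's bit, and
 * bch2_fsck_recovery_passes() is simply the mask of every pass flagged
 * PASS_FSCK. The real entries and their flags live in BCH_RECOVERY_PASSES().
 */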

static void bch2_run_async_recovery_passes(struct bch_fs *c)
{
	if (!down_trylock(&c->recovery.run_lock))
		return;

	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
		goto unlock;

	if (queue_work(system_long_wq, &c->recovery.work))
		return;

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
unlock:
	up(&c->recovery.run_lock);
}

static bool recovery_pass_needs_set(struct bch_fs *c,
				    enum bch_recovery_pass pass,
				    enum bch_run_recovery_pass_flags *flags)
{
	struct bch_fs_recovery *r = &c->recovery;

	/*
	 * Never run scan_for_btree_nodes persistently: check_topology will run
	 * it if required
	 */
	if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
		*flags |= RUN_RECOVERY_PASS_nopersistent;

	if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
	    !bch2_recovery_pass_want_ratelimit(c, pass))
		*flags &= ~RUN_RECOVERY_PASS_ratelimit;

	/*
	 * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
	 * anything if the pass has already run: it means we need a prior pass
	 * to run before we continue repairing; we don't expect that pass to fix
	 * the damage we encountered.
	 *
	 * Otherwise, we run run_explicit_recovery_pass when we find damage, so
	 * it should run again even if it's already run:
	 */
	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
	bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
	bool rewind = in_recovery &&
		r->curr_pass > pass &&
		!(r->passes_complete & BIT_ULL(pass));

	if (persistent
	    ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
	    : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
		return true;

	if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
	    (r->passes_ratelimiting & BIT_ULL(pass)))
		return true;

	if (rewind)
		return true;

	return false;
}

/*
 * For when we need to rewind recovery passes and run a pass we skipped:
 */
int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
				      struct printbuf *out,
				      enum bch_recovery_pass pass,
				      enum bch_run_recovery_pass_flags flags)
{
	struct bch_fs_recovery *r = &c->recovery;
	int ret = 0;

	lockdep_assert_held(&c->sb_lock);

	bch2_printbuf_make_room(out, 1024);
	out->atomic++;

	unsigned long lockflags;
	spin_lock_irqsave(&r->lock, lockflags);

	if (!recovery_pass_needs_set(c, pass, &flags))
		goto out;

	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
	bool rewind = in_recovery &&
		r->curr_pass > pass &&
		!(r->passes_complete & BIT_ULL(pass));
	bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;

	if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) {
		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
		__set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
	}

	if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
	    (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
		prt_printf(out, "need recovery pass %s (%u), but already rw\n",
			   bch2_recovery_passes[pass], pass);
		ret = bch_err_throw(c, cannot_rewind_recovery);
		goto out;
	}

	if (ratelimit)
		r->passes_ratelimiting |= BIT_ULL(pass);
	else
		r->passes_ratelimiting &= ~BIT_ULL(pass);

	if (in_recovery && !ratelimit) {
		prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
			   bch2_recovery_passes[pass], pass,
			   bch2_recovery_passes[r->curr_pass], r->curr_pass,
			   rewind ? " - rewinding" : "");

		r->passes_to_run |= BIT_ULL(pass);

		if (rewind) {
			r->next_pass = pass;
			r->passes_complete &= (1ULL << pass) >> 1;
			ret = bch_err_throw(c, restart_recovery);
		}
	} else {
		prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
			   bch2_recovery_passes[pass], pass,
			   ratelimit ? " - ratelimiting" : "");

		struct recovery_pass_fn *p = recovery_pass_fns + pass;
		if (p->when & PASS_ONLINE)
			bch2_run_async_recovery_passes(c);
	}
out:
	spin_unlock_irqrestore(&r->lock, lockflags);
	--out->atomic;
	return ret;
}

int bch2_run_explicit_recovery_pass(struct bch_fs *c,
				    struct printbuf *out,
				    enum bch_recovery_pass pass,
				    enum bch_run_recovery_pass_flags flags)
{
	int ret = 0;

	if (recovery_pass_needs_set(c, pass, &flags)) {
		guard(mutex)(&c->sb_lock);
		ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
		bch2_write_super(c);
	}

	return ret;
}

/*
 * Returns 0 if @pass has run recently, otherwise one of
 * -BCH_ERR_restart_recovery
 * -BCH_ERR_recovery_pass_will_run
 */
int bch2_require_recovery_pass(struct bch_fs *c,
			       struct printbuf *out,
			       enum bch_recovery_pass pass)
{
	if (test_bit(BCH_FS_in_recovery, &c->flags) &&
	    c->recovery.passes_complete & BIT_ULL(pass))
		return 0;

	guard(mutex)(&c->sb_lock);

	if (bch2_recovery_pass_want_ratelimit(c, pass))
		return 0;

	enum bch_run_recovery_pass_flags flags = 0;
	int ret = 0;

	if (recovery_pass_needs_set(c, pass, &flags)) {
		ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
		bch2_write_super(c);
	}

	return ret ?: bch_err_throw(c, recovery_pass_will_run);
}

int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
	enum bch_run_recovery_pass_flags flags = 0;

	if (!recovery_pass_needs_set(c, pass, &flags))
		return 0;

	struct printbuf buf = PRINTBUF;
	bch2_log_msg_start(c, &buf);

	mutex_lock(&c->sb_lock);
	int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
						    RUN_RECOVERY_PASS_nopersistent);
	mutex_unlock(&c->sb_lock);

	bch2_print_str(c, KERN_NOTICE, buf.buf);
	printbuf_exit(&buf);
	return ret;
}

static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct bch_fs_recovery *r = &c->recovery;
	struct recovery_pass_fn *p = recovery_pass_fns + pass;

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
			   bch2_recovery_passes[pass]);

	s64 start_time = ktime_get_real_seconds();
	int ret = p->fn(c);

	r->passes_to_run &= ~BIT_ULL(pass);

	if (ret) {
		r->passes_failing |= BIT_ULL(pass);
		return ret;
	}

	r->passes_failing = 0;

	if (!test_bit(BCH_FS_error, &c->flags))
		bch2_sb_recovery_pass_complete(c, pass, start_time);

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_CONT " done\n");

	return 0;
}

static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
				      bool online)
{
	struct bch_fs_recovery *r = &c->recovery;
	int ret = 0;

	spin_lock_irq(&r->lock);

	if (online)
		orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
		orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);

	/*
	 * A failed recovery pass will be retried after another pass succeeds -
	 * but not this iteration.
	 *
	 * This is because some passes depend on repair done by other passes: we
	 * may want to retry, but we don't want to loop on failing passes.
	 */
	orig_passes_to_run &= ~r->passes_failing;

	r->passes_to_run = orig_passes_to_run;

	while (r->passes_to_run) {
		unsigned prev_done = r->pass_done;
		unsigned pass = __ffs64(r->passes_to_run);
		r->curr_pass = pass;
		r->next_pass = r->curr_pass + 1;
		r->passes_to_run &= ~BIT_ULL(pass);

		spin_unlock_irq(&r->lock);

		int ret2 = bch2_run_recovery_pass(c, pass) ?:
			bch2_journal_flush(&c->journal);

		spin_lock_irq(&r->lock);

		if (r->next_pass < r->curr_pass) {
			/* Rewind: */
			r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
		} else if (!ret2) {
			r->pass_done = max(r->pass_done, pass);
			r->passes_complete |= BIT_ULL(pass);
		} else {
			ret = ret2;
		}

		if (ret && !online)
			break;

		if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
		    r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
			bch2_copygc_wakeup(c);
			bch2_rebalance_wakeup(c);
		}
	}

	clear_bit(BCH_FS_in_recovery, &c->flags);
	spin_unlock_irq(&r->lock);

	return ret;
}
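
/*
 * Rewind example (illustrative sketch): suppose a later pass is running and
 * repair finds it needs an earlier pass, e.g. check_snapshots, that was
 * already passed over. __bch2_run_explicit_recovery_pass() sets r->next_pass
 * to the earlier pass and returns -BCH_ERR_restart_recovery; the loop above
 * sees next_pass < curr_pass and re-queues everything from next_pass onwards:
 *
 *	r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
 *
 * so recovery resumes from the earlier pass instead of aborting.
 */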

static void bch2_async_recovery_passes_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
	struct bch_fs_recovery *r = &c->recovery;

	__bch2_run_recovery_passes(c,
		c->sb.recovery_passes_required & ~r->passes_ratelimiting,
		true);

	up(&r->run_lock);
	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
}

int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
{
	return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
}

int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
{
	u64 passes =
		bch2_recovery_passes_match(PASS_ALWAYS) |
		(!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
		(c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
		c->opts.recovery_passes |
		c->sb.recovery_passes_required;

	if (c->opts.recovery_pass_last)
		passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;

	/*
	 * We can't allow set_may_go_rw to be excluded; that would cause us to
	 * use the journal replay keys for updates where it's not expected.
	 */
	c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw);
	passes &= ~c->opts.recovery_passes_exclude;

	passes &= ~(BIT_ULL(from) - 1);

	down(&c->recovery.run_lock);
	int ret = __bch2_run_recovery_passes(c, passes, false);
	up(&c->recovery.run_lock);

	return ret;
}

static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
{
	prt_printf(out, "%s:\t", msg);
	prt_bitflags(out, bch2_recovery_passes, passes);
	prt_newline(out);
}

void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_fs_recovery *r = &c->recovery;

	printbuf_tabstop_push(out, 32);
	prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
	prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
		   bch2_recovery_passes_match(PASS_ONLINE));
	prt_passes(out, "Complete passes", r->passes_complete);
	prt_passes(out, "Failing passes", r->passes_failing);

	if (r->curr_pass) {
		prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
		prt_passes(out, "Current passes", r->passes_to_run);
	}
}

void bch2_fs_recovery_passes_init(struct bch_fs *c)
{
	spin_lock_init(&c->recovery.lock);
	sema_init(&c->recovery.run_lock, 1);

	INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
}