// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/string_choices.h>
#include <linux/sched/sysctl.h>

void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
{
	lockdep_assert_held(&c->sb_lock);

	for_each_member_device(c, ca) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

		m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
		m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
	}
}

void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	for_each_member_device(c, ca) {
		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);

		unsigned idx = le32_to_cpu(m.last_journal_bucket);
		if (idx < ca->journal.nr)
			ca->journal.cur_idx = idx;
		unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
		if (offset <= ca->mi.bucket_size)
			ca->journal.sectors_free = ca->mi.bucket_size - offset;
	}
	mutex_unlock(&c->sb_lock);
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			       struct journal_replay *j)
{
	darray_for_each(j->ptrs, i) {
		if (i != j->ptrs.data)
			prt_printf(out, " ");
		prt_printf(out, "%u:%u:%u (sector %llu)",
			   i->dev, i->bucket, i->bucket_offset, i->sector);
	}
}

static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
					struct journal_replay *j)
{
	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));

	bch2_journal_ptrs_to_text(out, c, j);

	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
		struct jset_entry_datetime *datetime =
			container_of(entry, struct jset_entry_datetime, entry);
		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
		break;
	}
}

static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
	if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
		*csum = (struct bch_csum) {};
		return false;
	}

	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
	return !bch2_crc_cmp(j->csum, *csum);
}

static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvfree(i);
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

struct journal_list {
	struct closure cl;
	u64 last_seq;
	struct mutex lock;
	int ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = JOURNAL_ENTRY_ADD_OK;

	if (!c->journal.oldest_seq_found_ondisk ||
	    le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
		c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: assume instead that they will all fall
	 * within the range of +/- 2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;

			journal_replay_free(c, i, false);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	/*
	 * Duplicate journal entries?
If so we want the one that didn't have a 195 * checksum error: 196 */ 197 dup = *_i; 198 if (dup) { 199 bool identical = bytes == vstruct_bytes(&dup->j) && 200 !memcmp(j, &dup->j, bytes); 201 bool not_identical = !identical && 202 entry_ptr.csum_good && 203 dup->csum_good; 204 205 bool same_device = false; 206 darray_for_each(dup->ptrs, ptr) 207 if (ptr->dev == ca->dev_idx) 208 same_device = true; 209 210 ret = darray_push(&dup->ptrs, entry_ptr); 211 if (ret) 212 goto out; 213 214 bch2_journal_replay_to_text(&buf, c, dup); 215 216 fsck_err_on(same_device, 217 c, journal_entry_dup_same_device, 218 "duplicate journal entry on same device\n%s", 219 buf.buf); 220 221 fsck_err_on(not_identical, 222 c, journal_entry_replicas_data_mismatch, 223 "found duplicate but non identical journal entries\n%s", 224 buf.buf); 225 226 if (entry_ptr.csum_good && !identical) 227 goto replace; 228 229 goto out; 230 } 231 replace: 232 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 233 if (!i) 234 return -BCH_ERR_ENOMEM_journal_entry_add; 235 236 darray_init(&i->ptrs); 237 i->csum_good = entry_ptr.csum_good; 238 i->ignore_blacklisted = false; 239 i->ignore_not_dirty = false; 240 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 241 242 if (dup) { 243 /* The first ptr should represent the jset we kept: */ 244 darray_for_each(dup->ptrs, ptr) 245 darray_push(&i->ptrs, *ptr); 246 __journal_replay_free(c, dup); 247 } else { 248 darray_push(&i->ptrs, entry_ptr); 249 } 250 251 *_i = i; 252 out: 253 fsck_err: 254 printbuf_exit(&buf); 255 return ret; 256 } 257 258 /* this fills in a range with empty jset_entries: */ 259 static void journal_entry_null_range(void *start, void *end) 260 { 261 struct jset_entry *entry; 262 263 for (entry = start; entry != end; entry = vstruct_next(entry)) 264 memset(entry, 0, sizeof(*entry)); 265 } 266 267 #define JOURNAL_ENTRY_REREAD 5 268 #define JOURNAL_ENTRY_NONE 6 269 #define JOURNAL_ENTRY_BAD 7 270 271 static void journal_entry_err_msg(struct printbuf *out, 272 u32 version, 273 struct jset *jset, 274 struct jset_entry *entry) 275 { 276 prt_str(out, "invalid journal entry, version="); 277 bch2_version_to_text(out, version); 278 279 if (entry) { 280 prt_str(out, " type="); 281 bch2_prt_jset_entry_type(out, entry->type); 282 } 283 284 if (!jset) { 285 prt_printf(out, " in superblock"); 286 } else { 287 288 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 289 290 if (entry) 291 prt_printf(out, " offset=%zi/%u", 292 (u64 *) entry - jset->_data, 293 le32_to_cpu(jset->u64s)); 294 } 295 296 prt_str(out, ": "); 297 } 298 299 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 300 ({ \ 301 struct printbuf _buf = PRINTBUF; \ 302 \ 303 journal_entry_err_msg(&_buf, version, jset, entry); \ 304 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 305 \ 306 switch (from.flags & BCH_VALIDATE_write) { \ 307 case READ: \ 308 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 309 break; \ 310 case WRITE: \ 311 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 312 if (bch2_fs_inconsistent(c, \ 313 "corrupt metadata before write: %s\n", _buf.buf)) {\ 314 ret = -BCH_ERR_fsck_errors_not_fixed; \ 315 goto fsck_err; \ 316 } \ 317 break; \ 318 } \ 319 \ 320 printbuf_exit(&_buf); \ 321 true; \ 322 }) 323 324 #define journal_entry_err_on(cond, ...) \ 325 ((cond) ? 
journal_entry_err(__VA_ARGS__) : false) 326 327 #define FSCK_DELETED_KEY 5 328 329 static int journal_validate_key(struct bch_fs *c, 330 struct jset *jset, 331 struct jset_entry *entry, 332 struct bkey_i *k, 333 struct bkey_validate_context from, 334 unsigned version, int big_endian) 335 { 336 enum bch_validate_flags flags = from.flags; 337 int write = flags & BCH_VALIDATE_write; 338 void *next = vstruct_next(entry); 339 int ret = 0; 340 341 if (journal_entry_err_on(!k->k.u64s, 342 c, version, jset, entry, 343 journal_entry_bkey_u64s_0, 344 "k->u64s 0")) { 345 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 346 journal_entry_null_range(vstruct_next(entry), next); 347 return FSCK_DELETED_KEY; 348 } 349 350 if (journal_entry_err_on((void *) bkey_next(k) > 351 (void *) vstruct_next(entry), 352 c, version, jset, entry, 353 journal_entry_bkey_past_end, 354 "extends past end of journal entry")) { 355 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 356 journal_entry_null_range(vstruct_next(entry), next); 357 return FSCK_DELETED_KEY; 358 } 359 360 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 361 c, version, jset, entry, 362 journal_entry_bkey_bad_format, 363 "bad format %u", k->k.format)) { 364 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 365 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 366 journal_entry_null_range(vstruct_next(entry), next); 367 return FSCK_DELETED_KEY; 368 } 369 370 if (!write) 371 bch2_bkey_compat(from.level, from.btree, version, big_endian, 372 write, NULL, bkey_to_packed(k)); 373 374 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); 375 if (ret == -BCH_ERR_fsck_delete_bkey) { 376 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 377 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 378 journal_entry_null_range(vstruct_next(entry), next); 379 return FSCK_DELETED_KEY; 380 } 381 if (ret) 382 goto fsck_err; 383 384 if (write) 385 bch2_bkey_compat(from.level, from.btree, version, big_endian, 386 write, NULL, bkey_to_packed(k)); 387 fsck_err: 388 return ret; 389 } 390 391 static int journal_entry_btree_keys_validate(struct bch_fs *c, 392 struct jset *jset, 393 struct jset_entry *entry, 394 unsigned version, int big_endian, 395 struct bkey_validate_context from) 396 { 397 struct bkey_i *k = entry->start; 398 399 from.level = entry->level; 400 from.btree = entry->btree_id; 401 402 while (k != vstruct_last(entry)) { 403 int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 404 if (ret == FSCK_DELETED_KEY) 405 continue; 406 else if (ret) 407 return ret; 408 409 k = bkey_next(k); 410 } 411 412 return 0; 413 } 414 415 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 416 struct jset_entry *entry) 417 { 418 bool first = true; 419 420 jset_entry_for_each_key(entry, k) { 421 if (!first) { 422 prt_newline(out); 423 bch2_prt_jset_entry_type(out, entry->type); 424 prt_str(out, ": "); 425 } 426 bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); 427 prt_char(out, ' '); 428 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 429 first = false; 430 } 431 } 432 433 static int journal_entry_btree_root_validate(struct bch_fs *c, 434 struct jset *jset, 435 struct jset_entry *entry, 436 unsigned version, int big_endian, 437 struct bkey_validate_context from) 438 { 439 struct bkey_i *k = entry->start; 440 int ret = 0; 441 442 from.root = true; 443 from.level = entry->level + 1; 444 from.btree = entry->btree_id; 445 446 if (journal_entry_err_on(!entry->u64s || 447 le16_to_cpu(entry->u64s) 
!= k->k.u64s, 448 c, version, jset, entry, 449 journal_entry_btree_root_bad_size, 450 "invalid btree root journal entry: wrong number of keys")) { 451 void *next = vstruct_next(entry); 452 /* 453 * we don't want to null out this jset_entry, 454 * just the contents, so that later we can tell 455 * we were _supposed_ to have a btree root 456 */ 457 entry->u64s = 0; 458 journal_entry_null_range(vstruct_next(entry), next); 459 return 0; 460 } 461 462 ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 463 if (ret == FSCK_DELETED_KEY) 464 ret = 0; 465 fsck_err: 466 return ret; 467 } 468 469 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 470 struct jset_entry *entry) 471 { 472 journal_entry_btree_keys_to_text(out, c, entry); 473 } 474 475 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 476 struct jset *jset, 477 struct jset_entry *entry, 478 unsigned version, int big_endian, 479 struct bkey_validate_context from) 480 { 481 /* obsolete, don't care: */ 482 return 0; 483 } 484 485 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 486 struct jset_entry *entry) 487 { 488 } 489 490 static int journal_entry_blacklist_validate(struct bch_fs *c, 491 struct jset *jset, 492 struct jset_entry *entry, 493 unsigned version, int big_endian, 494 struct bkey_validate_context from) 495 { 496 int ret = 0; 497 498 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 499 c, version, jset, entry, 500 journal_entry_blacklist_bad_size, 501 "invalid journal seq blacklist entry: bad size")) { 502 journal_entry_null_range(entry, vstruct_next(entry)); 503 } 504 fsck_err: 505 return ret; 506 } 507 508 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 509 struct jset_entry *entry) 510 { 511 struct jset_entry_blacklist *bl = 512 container_of(entry, struct jset_entry_blacklist, entry); 513 514 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 515 } 516 517 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 518 struct jset *jset, 519 struct jset_entry *entry, 520 unsigned version, int big_endian, 521 struct bkey_validate_context from) 522 { 523 struct jset_entry_blacklist_v2 *bl_entry; 524 int ret = 0; 525 526 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 527 c, version, jset, entry, 528 journal_entry_blacklist_v2_bad_size, 529 "invalid journal seq blacklist entry: bad size")) { 530 journal_entry_null_range(entry, vstruct_next(entry)); 531 goto out; 532 } 533 534 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 535 536 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 537 le64_to_cpu(bl_entry->end), 538 c, version, jset, entry, 539 journal_entry_blacklist_v2_start_past_end, 540 "invalid journal seq blacklist entry: start > end")) { 541 journal_entry_null_range(entry, vstruct_next(entry)); 542 } 543 out: 544 fsck_err: 545 return ret; 546 } 547 548 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 549 struct jset_entry *entry) 550 { 551 struct jset_entry_blacklist_v2 *bl = 552 container_of(entry, struct jset_entry_blacklist_v2, entry); 553 554 prt_printf(out, "start=%llu end=%llu", 555 le64_to_cpu(bl->start), 556 le64_to_cpu(bl->end)); 557 } 558 559 static int journal_entry_usage_validate(struct bch_fs *c, 560 struct jset *jset, 561 struct jset_entry *entry, 562 unsigned version, int big_endian, 563 struct bkey_validate_context from) 564 { 565 struct jset_entry_usage *u = 566 container_of(entry, 
struct jset_entry_usage, entry); 567 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 568 int ret = 0; 569 570 if (journal_entry_err_on(bytes < sizeof(*u), 571 c, version, jset, entry, 572 journal_entry_usage_bad_size, 573 "invalid journal entry usage: bad size")) { 574 journal_entry_null_range(entry, vstruct_next(entry)); 575 return ret; 576 } 577 578 fsck_err: 579 return ret; 580 } 581 582 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 583 struct jset_entry *entry) 584 { 585 struct jset_entry_usage *u = 586 container_of(entry, struct jset_entry_usage, entry); 587 588 prt_str(out, "type="); 589 bch2_prt_fs_usage_type(out, u->entry.btree_id); 590 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 591 } 592 593 static int journal_entry_data_usage_validate(struct bch_fs *c, 594 struct jset *jset, 595 struct jset_entry *entry, 596 unsigned version, int big_endian, 597 struct bkey_validate_context from) 598 { 599 struct jset_entry_data_usage *u = 600 container_of(entry, struct jset_entry_data_usage, entry); 601 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 602 struct printbuf err = PRINTBUF; 603 int ret = 0; 604 605 if (journal_entry_err_on(bytes < sizeof(*u) || 606 bytes < sizeof(*u) + u->r.nr_devs, 607 c, version, jset, entry, 608 journal_entry_data_usage_bad_size, 609 "invalid journal entry usage: bad size")) { 610 journal_entry_null_range(entry, vstruct_next(entry)); 611 goto out; 612 } 613 614 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), 615 c, version, jset, entry, 616 journal_entry_data_usage_bad_size, 617 "invalid journal entry usage: %s", err.buf)) { 618 journal_entry_null_range(entry, vstruct_next(entry)); 619 goto out; 620 } 621 out: 622 fsck_err: 623 printbuf_exit(&err); 624 return ret; 625 } 626 627 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 628 struct jset_entry *entry) 629 { 630 struct jset_entry_data_usage *u = 631 container_of(entry, struct jset_entry_data_usage, entry); 632 633 bch2_replicas_entry_to_text(out, &u->r); 634 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 635 } 636 637 static int journal_entry_clock_validate(struct bch_fs *c, 638 struct jset *jset, 639 struct jset_entry *entry, 640 unsigned version, int big_endian, 641 struct bkey_validate_context from) 642 { 643 struct jset_entry_clock *clock = 644 container_of(entry, struct jset_entry_clock, entry); 645 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 646 int ret = 0; 647 648 if (journal_entry_err_on(bytes != sizeof(*clock), 649 c, version, jset, entry, 650 journal_entry_clock_bad_size, 651 "bad size")) { 652 journal_entry_null_range(entry, vstruct_next(entry)); 653 return ret; 654 } 655 656 if (journal_entry_err_on(clock->rw > 1, 657 c, version, jset, entry, 658 journal_entry_clock_bad_rw, 659 "bad rw")) { 660 journal_entry_null_range(entry, vstruct_next(entry)); 661 return ret; 662 } 663 664 fsck_err: 665 return ret; 666 } 667 668 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 669 struct jset_entry *entry) 670 { 671 struct jset_entry_clock *clock = 672 container_of(entry, struct jset_entry_clock, entry); 673 674 prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); 675 } 676 677 static int journal_entry_dev_usage_validate(struct bch_fs *c, 678 struct jset *jset, 679 struct jset_entry *entry, 680 unsigned version, int big_endian, 681 struct bkey_validate_context from) 682 { 683 struct 
jset_entry_dev_usage *u = 684 container_of(entry, struct jset_entry_dev_usage, entry); 685 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 686 unsigned expected = sizeof(*u); 687 int ret = 0; 688 689 if (journal_entry_err_on(bytes < expected, 690 c, version, jset, entry, 691 journal_entry_dev_usage_bad_size, 692 "bad size (%u < %u)", 693 bytes, expected)) { 694 journal_entry_null_range(entry, vstruct_next(entry)); 695 return ret; 696 } 697 698 if (journal_entry_err_on(u->pad, 699 c, version, jset, entry, 700 journal_entry_dev_usage_bad_pad, 701 "bad pad")) { 702 journal_entry_null_range(entry, vstruct_next(entry)); 703 return ret; 704 } 705 706 fsck_err: 707 return ret; 708 } 709 710 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 711 struct jset_entry *entry) 712 { 713 struct jset_entry_dev_usage *u = 714 container_of(entry, struct jset_entry_dev_usage, entry); 715 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 716 717 if (vstruct_bytes(entry) < sizeof(*u)) 718 return; 719 720 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 721 722 printbuf_indent_add(out, 2); 723 for (i = 0; i < nr_types; i++) { 724 prt_newline(out); 725 bch2_prt_data_type(out, i); 726 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 727 le64_to_cpu(u->d[i].buckets), 728 le64_to_cpu(u->d[i].sectors), 729 le64_to_cpu(u->d[i].fragmented)); 730 } 731 printbuf_indent_sub(out, 2); 732 } 733 734 static int journal_entry_log_validate(struct bch_fs *c, 735 struct jset *jset, 736 struct jset_entry *entry, 737 unsigned version, int big_endian, 738 struct bkey_validate_context from) 739 { 740 return 0; 741 } 742 743 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 744 struct jset_entry *entry) 745 { 746 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 747 748 prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); 749 } 750 751 static int journal_entry_overwrite_validate(struct bch_fs *c, 752 struct jset *jset, 753 struct jset_entry *entry, 754 unsigned version, int big_endian, 755 struct bkey_validate_context from) 756 { 757 from.flags = 0; 758 return journal_entry_btree_keys_validate(c, jset, entry, 759 version, big_endian, from); 760 } 761 762 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 763 struct jset_entry *entry) 764 { 765 journal_entry_btree_keys_to_text(out, c, entry); 766 } 767 768 static int journal_entry_log_bkey_validate(struct bch_fs *c, 769 struct jset *jset, 770 struct jset_entry *entry, 771 unsigned version, int big_endian, 772 struct bkey_validate_context from) 773 { 774 from.flags = 0; 775 return journal_entry_btree_keys_validate(c, jset, entry, 776 version, big_endian, from); 777 } 778 779 static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, 780 struct jset_entry *entry) 781 { 782 journal_entry_btree_keys_to_text(out, c, entry); 783 } 784 785 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 786 struct jset *jset, 787 struct jset_entry *entry, 788 unsigned version, int big_endian, 789 struct bkey_validate_context from) 790 { 791 return journal_entry_btree_keys_validate(c, jset, entry, 792 version, big_endian, from); 793 } 794 795 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 796 struct jset_entry *entry) 797 { 798 journal_entry_btree_keys_to_text(out, c, entry); 799 } 800 801 static int journal_entry_datetime_validate(struct bch_fs *c, 802 
struct jset *jset, 803 struct jset_entry *entry, 804 unsigned version, int big_endian, 805 struct bkey_validate_context from) 806 { 807 unsigned bytes = vstruct_bytes(entry); 808 unsigned expected = 16; 809 int ret = 0; 810 811 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 812 c, version, jset, entry, 813 journal_entry_dev_usage_bad_size, 814 "bad size (%u < %u)", 815 bytes, expected)) { 816 journal_entry_null_range(entry, vstruct_next(entry)); 817 return ret; 818 } 819 fsck_err: 820 return ret; 821 } 822 823 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 824 struct jset_entry *entry) 825 { 826 struct jset_entry_datetime *datetime = 827 container_of(entry, struct jset_entry_datetime, entry); 828 829 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 830 } 831 832 struct jset_entry_ops { 833 int (*validate)(struct bch_fs *, struct jset *, 834 struct jset_entry *, unsigned, int, 835 struct bkey_validate_context); 836 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 837 }; 838 839 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 840 #define x(f, nr) \ 841 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 842 .validate = journal_entry_##f##_validate, \ 843 .to_text = journal_entry_##f##_to_text, \ 844 }, 845 BCH_JSET_ENTRY_TYPES() 846 #undef x 847 }; 848 849 int bch2_journal_entry_validate(struct bch_fs *c, 850 struct jset *jset, 851 struct jset_entry *entry, 852 unsigned version, int big_endian, 853 struct bkey_validate_context from) 854 { 855 return entry->type < BCH_JSET_ENTRY_NR 856 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 857 version, big_endian, from) 858 : 0; 859 } 860 861 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 862 struct jset_entry *entry) 863 { 864 bch2_prt_jset_entry_type(out, entry->type); 865 866 if (entry->type < BCH_JSET_ENTRY_NR) { 867 prt_str(out, ": "); 868 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 869 } 870 } 871 872 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 873 enum bch_validate_flags flags) 874 { 875 struct bkey_validate_context from = { 876 .flags = flags, 877 .from = BKEY_VALIDATE_journal, 878 .journal_seq = le64_to_cpu(jset->seq), 879 }; 880 881 unsigned version = le32_to_cpu(jset->version); 882 int ret = 0; 883 884 vstruct_for_each(jset, entry) { 885 from.journal_offset = (u64 *) entry - jset->_data; 886 887 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 888 c, version, jset, entry, 889 journal_entry_past_jset_end, 890 "journal entry extends past end of jset")) { 891 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 892 break; 893 } 894 895 ret = bch2_journal_entry_validate(c, jset, entry, version, 896 JSET_BIG_ENDIAN(jset), from); 897 if (ret) 898 break; 899 } 900 fsck_err: 901 return ret; 902 } 903 904 static int jset_validate(struct bch_fs *c, 905 struct bch_dev *ca, 906 struct jset *jset, u64 sector, 907 enum bch_validate_flags flags) 908 { 909 struct bkey_validate_context from = { 910 .flags = flags, 911 .from = BKEY_VALIDATE_journal, 912 .journal_seq = le64_to_cpu(jset->seq), 913 }; 914 int ret = 0; 915 916 if (le64_to_cpu(jset->magic) != jset_magic(c)) 917 return JOURNAL_ENTRY_NONE; 918 919 unsigned version = le32_to_cpu(jset->version); 920 if (journal_entry_err_on(!bch2_version_compatible(version), 921 c, version, jset, NULL, 922 jset_unsupported_version, 923 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 924 ca ? 
ca->name : c->name, 925 sector, le64_to_cpu(jset->seq), 926 BCH_VERSION_MAJOR(version), 927 BCH_VERSION_MINOR(version))) { 928 /* don't try to continue: */ 929 return -EINVAL; 930 } 931 932 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 933 c, version, jset, NULL, 934 jset_unknown_csum, 935 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 936 ca ? ca->name : c->name, 937 sector, le64_to_cpu(jset->seq), 938 JSET_CSUM_TYPE(jset))) 939 ret = JOURNAL_ENTRY_BAD; 940 941 /* last_seq is ignored when JSET_NO_FLUSH is true */ 942 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 943 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 944 c, version, jset, NULL, 945 jset_last_seq_newer_than_seq, 946 "invalid journal entry: last_seq > seq (%llu > %llu)", 947 le64_to_cpu(jset->last_seq), 948 le64_to_cpu(jset->seq))) { 949 jset->last_seq = jset->seq; 950 return JOURNAL_ENTRY_BAD; 951 } 952 953 ret = jset_validate_entries(c, jset, flags); 954 fsck_err: 955 return ret; 956 } 957 958 static int jset_validate_early(struct bch_fs *c, 959 struct bch_dev *ca, 960 struct jset *jset, u64 sector, 961 unsigned bucket_sectors_left, 962 unsigned sectors_read) 963 { 964 struct bkey_validate_context from = { 965 .from = BKEY_VALIDATE_journal, 966 .journal_seq = le64_to_cpu(jset->seq), 967 }; 968 int ret = 0; 969 970 if (le64_to_cpu(jset->magic) != jset_magic(c)) 971 return JOURNAL_ENTRY_NONE; 972 973 unsigned version = le32_to_cpu(jset->version); 974 if (journal_entry_err_on(!bch2_version_compatible(version), 975 c, version, jset, NULL, 976 jset_unsupported_version, 977 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 978 ca ? ca->name : c->name, 979 sector, le64_to_cpu(jset->seq), 980 BCH_VERSION_MAJOR(version), 981 BCH_VERSION_MINOR(version))) { 982 /* don't try to continue: */ 983 return -EINVAL; 984 } 985 986 size_t bytes = vstruct_bytes(jset); 987 if (bytes > (sectors_read << 9) && 988 sectors_read < bucket_sectors_left) 989 return JOURNAL_ENTRY_REREAD; 990 991 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 992 c, version, jset, NULL, 993 jset_past_bucket_end, 994 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 995 ca ? 
ca->name : c->name, 996 sector, le64_to_cpu(jset->seq), bytes)) 997 le32_add_cpu(&jset->u64s, 998 -((bytes - (bucket_sectors_left << 9)) / 8)); 999 fsck_err: 1000 return ret; 1001 } 1002 1003 struct journal_read_buf { 1004 void *data; 1005 size_t size; 1006 }; 1007 1008 static int journal_read_buf_realloc(struct journal_read_buf *b, 1009 size_t new_size) 1010 { 1011 void *n; 1012 1013 /* the bios are sized for this many pages, max: */ 1014 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 1015 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 1016 1017 new_size = roundup_pow_of_two(new_size); 1018 n = kvmalloc(new_size, GFP_KERNEL); 1019 if (!n) 1020 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 1021 1022 kvfree(b->data); 1023 b->data = n; 1024 b->size = new_size; 1025 return 0; 1026 } 1027 1028 static int journal_read_bucket(struct bch_dev *ca, 1029 struct journal_read_buf *buf, 1030 struct journal_list *jlist, 1031 unsigned bucket) 1032 { 1033 struct bch_fs *c = ca->fs; 1034 struct journal_device *ja = &ca->journal; 1035 struct jset *j = NULL; 1036 unsigned sectors, sectors_read = 0; 1037 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1038 end = offset + ca->mi.bucket_size; 1039 bool saw_bad = false, csum_good; 1040 struct printbuf err = PRINTBUF; 1041 int ret = 0; 1042 1043 pr_debug("reading %u", bucket); 1044 1045 while (offset < end) { 1046 if (!sectors_read) { 1047 struct bio *bio; 1048 unsigned nr_bvecs; 1049 reread: 1050 sectors_read = min_t(unsigned, 1051 end - offset, buf->size >> 9); 1052 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1053 1054 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1055 if (!bio) 1056 return -BCH_ERR_ENOMEM_journal_read_bucket; 1057 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1058 1059 bio->bi_iter.bi_sector = offset; 1060 bch2_bio_map(bio, buf->data, sectors_read << 9); 1061 1062 u64 submit_time = local_clock(); 1063 ret = submit_bio_wait(bio); 1064 kfree(bio); 1065 1066 if (!ret && bch2_meta_read_fault("journal")) 1067 ret = -BCH_ERR_EIO_fault_injected; 1068 1069 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, 1070 submit_time, !ret); 1071 1072 if (ret) { 1073 bch_err_dev_ratelimited(ca, 1074 "journal read error: sector %llu", offset); 1075 /* 1076 * We don't error out of the recovery process 1077 * here, since the relevant journal entry may be 1078 * found on a different device, and missing or 1079 * no journal entries will be handled later 1080 */ 1081 goto out; 1082 } 1083 1084 j = buf->data; 1085 } 1086 1087 ret = jset_validate_early(c, ca, j, offset, 1088 end - offset, sectors_read); 1089 switch (ret) { 1090 case 0: 1091 sectors = vstruct_sectors(j, c->block_bits); 1092 break; 1093 case JOURNAL_ENTRY_REREAD: 1094 if (vstruct_bytes(j) > buf->size) { 1095 ret = journal_read_buf_realloc(buf, 1096 vstruct_bytes(j)); 1097 if (ret) 1098 goto err; 1099 } 1100 goto reread; 1101 case JOURNAL_ENTRY_NONE: 1102 if (!saw_bad) 1103 goto out; 1104 /* 1105 * On checksum error we don't really trust the size 1106 * field of the journal entry we read, so try reading 1107 * again at next block boundary: 1108 */ 1109 sectors = block_sectors(c); 1110 goto next_block; 1111 default: 1112 goto err; 1113 } 1114 1115 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1116 ja->highest_seq_found = le64_to_cpu(j->seq); 1117 ja->cur_idx = bucket; 1118 ja->sectors_free = ca->mi.bucket_size - 1119 bucket_remainder(ca, offset) - sectors; 1120 } 1121 1122 /* 1123 * This happens sometimes if we don't have discards on - 1124 * when we've 
partially overwritten a bucket with new 1125 * journal entries. We don't need the rest of the 1126 * bucket: 1127 */ 1128 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1129 goto out; 1130 1131 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1132 1133 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1134 struct bch_csum csum; 1135 csum_good = jset_csum_good(c, j, &csum); 1136 1137 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); 1138 1139 if (!csum_good) { 1140 bch_err_dev_ratelimited(ca, "%s", 1141 (printbuf_reset(&err), 1142 prt_str(&err, "journal "), 1143 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1144 err.buf)); 1145 saw_bad = true; 1146 } 1147 1148 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1149 j->encrypted_start, 1150 vstruct_end(j) - (void *) j->encrypted_start); 1151 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1152 1153 mutex_lock(&jlist->lock); 1154 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1155 .csum_good = csum_good, 1156 .dev = ca->dev_idx, 1157 .bucket = bucket, 1158 .bucket_offset = offset - 1159 bucket_to_sector(ca, ja->buckets[bucket]), 1160 .sector = offset, 1161 }, jlist, j); 1162 mutex_unlock(&jlist->lock); 1163 1164 switch (ret) { 1165 case JOURNAL_ENTRY_ADD_OK: 1166 break; 1167 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1168 break; 1169 default: 1170 goto err; 1171 } 1172 next_block: 1173 pr_debug("next"); 1174 offset += sectors; 1175 sectors_read -= sectors; 1176 j = ((void *) j) + (sectors << 9); 1177 } 1178 1179 out: 1180 ret = 0; 1181 err: 1182 printbuf_exit(&err); 1183 return ret; 1184 } 1185 1186 static CLOSURE_CALLBACK(bch2_journal_read_device) 1187 { 1188 closure_type(ja, struct journal_device, read); 1189 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1190 struct bch_fs *c = ca->fs; 1191 struct journal_list *jlist = 1192 container_of(cl->parent, struct journal_list, cl); 1193 struct journal_read_buf buf = { NULL, 0 }; 1194 unsigned i; 1195 int ret = 0; 1196 1197 if (!ja->nr) 1198 goto out; 1199 1200 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1201 if (ret) 1202 goto err; 1203 1204 pr_debug("%u journal buckets", ja->nr); 1205 1206 for (i = 0; i < ja->nr; i++) { 1207 ret = journal_read_bucket(ca, &buf, jlist, i); 1208 if (ret) 1209 goto err; 1210 } 1211 1212 /* 1213 * Set dirty_idx to indicate the entire journal is full and needs to be 1214 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1215 * pinned when it first runs: 1216 */ 1217 ja->discard_idx = ja->dirty_idx_ondisk = 1218 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1219 out: 1220 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1221 kvfree(buf.data); 1222 percpu_ref_put(&ca->io_ref[READ]); 1223 closure_return(cl); 1224 return; 1225 err: 1226 mutex_lock(&jlist->lock); 1227 jlist->ret = ret; 1228 mutex_unlock(&jlist->lock); 1229 goto out; 1230 } 1231 1232 int bch2_journal_read(struct bch_fs *c, 1233 u64 *last_seq, 1234 u64 *blacklist_seq, 1235 u64 *start_seq) 1236 { 1237 struct journal_list jlist; 1238 struct journal_replay *i, **_i, *prev = NULL; 1239 struct genradix_iter radix_iter; 1240 struct printbuf buf = PRINTBUF; 1241 bool degraded = false, last_write_torn = false; 1242 u64 seq; 1243 int ret = 0; 1244 1245 closure_init_stack(&jlist.cl); 1246 mutex_init(&jlist.lock); 1247 jlist.last_seq = 0; 1248 jlist.ret = 0; 1249 1250 for_each_member_device(c, ca) { 1251 if (!c->opts.fsck && 1252 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1253 
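			/* not running fsck, and no journal data on this device: skip it */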
continue; 1254 1255 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1256 ca->mi.state == BCH_MEMBER_STATE_ro) && 1257 percpu_ref_tryget(&ca->io_ref[READ])) 1258 closure_call(&ca->journal.read, 1259 bch2_journal_read_device, 1260 system_unbound_wq, 1261 &jlist.cl); 1262 else 1263 degraded = true; 1264 } 1265 1266 while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) 1267 ; 1268 1269 if (jlist.ret) 1270 return jlist.ret; 1271 1272 *last_seq = 0; 1273 *start_seq = 0; 1274 *blacklist_seq = 0; 1275 1276 /* 1277 * Find most recent flush entry, and ignore newer non flush entries - 1278 * those entries will be blacklisted: 1279 */ 1280 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1281 i = *_i; 1282 1283 if (journal_replay_ignore(i)) 1284 continue; 1285 1286 if (!*start_seq) 1287 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1288 1289 if (JSET_NO_FLUSH(&i->j)) { 1290 i->ignore_blacklisted = true; 1291 continue; 1292 } 1293 1294 if (!last_write_torn && !i->csum_good) { 1295 last_write_torn = true; 1296 i->ignore_blacklisted = true; 1297 continue; 1298 } 1299 1300 struct bkey_validate_context from = { 1301 .from = BKEY_VALIDATE_journal, 1302 .journal_seq = le64_to_cpu(i->j.seq), 1303 }; 1304 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1305 c, le32_to_cpu(i->j.version), &i->j, NULL, 1306 jset_last_seq_newer_than_seq, 1307 "invalid journal entry: last_seq > seq (%llu > %llu)", 1308 le64_to_cpu(i->j.last_seq), 1309 le64_to_cpu(i->j.seq))) 1310 i->j.last_seq = i->j.seq; 1311 1312 *last_seq = le64_to_cpu(i->j.last_seq); 1313 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1314 break; 1315 } 1316 1317 if (!*start_seq) { 1318 bch_info(c, "journal read done, but no entries found"); 1319 return 0; 1320 } 1321 1322 if (!*last_seq) { 1323 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1324 "journal read done, but no entries found after dropping non-flushes"); 1325 return 0; 1326 } 1327 1328 bch_info(c, "journal read done, replaying entries %llu-%llu", 1329 *last_seq, *blacklist_seq - 1); 1330 1331 if (*start_seq != *blacklist_seq) 1332 bch_info(c, "dropped unflushed entries %llu-%llu", 1333 *blacklist_seq, *start_seq - 1); 1334 1335 /* Drop blacklisted entries and entries older than last_seq: */ 1336 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1337 i = *_i; 1338 1339 if (journal_replay_ignore(i)) 1340 continue; 1341 1342 seq = le64_to_cpu(i->j.seq); 1343 if (seq < *last_seq) { 1344 journal_replay_free(c, i, false); 1345 continue; 1346 } 1347 1348 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1349 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1350 jset_seq_blacklisted, 1351 "found blacklisted journal entry %llu", seq); 1352 i->ignore_blacklisted = true; 1353 } 1354 } 1355 1356 /* Check for missing entries: */ 1357 seq = *last_seq; 1358 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1359 i = *_i; 1360 1361 if (journal_replay_ignore(i)) 1362 continue; 1363 1364 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1365 1366 while (seq < le64_to_cpu(i->j.seq)) { 1367 u64 missing_start, missing_end; 1368 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1369 1370 while (seq < le64_to_cpu(i->j.seq) && 1371 bch2_journal_seq_is_blacklisted(c, seq, false)) 1372 seq++; 1373 1374 if (seq == le64_to_cpu(i->j.seq)) 1375 break; 1376 1377 missing_start = seq; 1378 1379 while (seq < le64_to_cpu(i->j.seq) && 1380 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1381 seq++; 1382 1383 if (prev) { 1384 
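				/*
				 * Note where the previous entry we did find lives, so the
				 * "journal entries missing" message below can say what came
				 * before and after the gap:
				 */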
bch2_journal_ptrs_to_text(&buf1, c, prev); 1385 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1386 } else 1387 prt_printf(&buf1, "(none)"); 1388 bch2_journal_ptrs_to_text(&buf2, c, i); 1389 1390 missing_end = seq - 1; 1391 fsck_err(c, journal_entries_missing, 1392 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1393 "prev at %s\n" 1394 "next at %s, continue?", 1395 missing_start, missing_end, 1396 *last_seq, *blacklist_seq - 1, 1397 buf1.buf, buf2.buf); 1398 1399 printbuf_exit(&buf1); 1400 printbuf_exit(&buf2); 1401 } 1402 1403 prev = i; 1404 seq++; 1405 } 1406 1407 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1408 struct bch_replicas_padded replicas = { 1409 .e.data_type = BCH_DATA_journal, 1410 .e.nr_devs = 0, 1411 .e.nr_required = 1, 1412 }; 1413 1414 i = *_i; 1415 if (journal_replay_ignore(i)) 1416 continue; 1417 1418 darray_for_each(i->ptrs, ptr) { 1419 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1420 1421 if (!ptr->csum_good) 1422 bch_err_dev_offset(ca, ptr->sector, 1423 "invalid journal checksum, seq %llu%s", 1424 le64_to_cpu(i->j.seq), 1425 i->csum_good ? " (had good copy on another device)" : ""); 1426 } 1427 1428 ret = jset_validate(c, 1429 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1430 &i->j, 1431 i->ptrs.data[0].sector, 1432 READ); 1433 if (ret) 1434 goto err; 1435 1436 darray_for_each(i->ptrs, ptr) 1437 replicas_entry_add_dev(&replicas.e, ptr->dev); 1438 1439 bch2_replicas_entry_sort(&replicas.e); 1440 1441 printbuf_reset(&buf); 1442 bch2_replicas_entry_to_text(&buf, &replicas.e); 1443 1444 if (!degraded && 1445 !bch2_replicas_marked(c, &replicas.e) && 1446 (le64_to_cpu(i->j.seq) == *last_seq || 1447 fsck_err(c, journal_entry_replicas_not_marked, 1448 "superblock not marked as containing replicas for journal entry %llu\n%s", 1449 le64_to_cpu(i->j.seq), buf.buf))) { 1450 ret = bch2_mark_replicas(c, &replicas.e); 1451 if (ret) 1452 goto err; 1453 } 1454 } 1455 err: 1456 fsck_err: 1457 printbuf_exit(&buf); 1458 return ret; 1459 } 1460 1461 /* journal write: */ 1462 1463 static void journal_advance_devs_to_next_bucket(struct journal *j, 1464 struct dev_alloc_list *devs, 1465 unsigned sectors, __le64 seq) 1466 { 1467 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1468 1469 darray_for_each(*devs, i) { 1470 struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1471 if (!ca) 1472 continue; 1473 1474 struct journal_device *ja = &ca->journal; 1475 1476 if (sectors > ja->sectors_free && 1477 sectors <= ca->mi.bucket_size && 1478 bch2_journal_dev_buckets_available(j, ja, 1479 journal_space_discarded)) { 1480 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1481 ja->sectors_free = ca->mi.bucket_size; 1482 1483 /* 1484 * ja->bucket_seq[ja->cur_idx] must always have 1485 * something sensible: 1486 */ 1487 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); 1488 } 1489 } 1490 } 1491 1492 static void __journal_write_alloc(struct journal *j, 1493 struct journal_buf *w, 1494 struct dev_alloc_list *devs, 1495 unsigned sectors, 1496 unsigned *replicas, 1497 unsigned replicas_want) 1498 { 1499 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1500 1501 darray_for_each(*devs, i) { 1502 struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1503 if (!ca) 1504 continue; 1505 1506 struct journal_device *ja = &ca->journal; 1507 1508 /* 1509 * Check that we can use this device, and aren't already using 1510 * it: 1511 */ 1512 if (!ca->mi.durability || 1513 ca->mi.state != BCH_MEMBER_STATE_rw || 1514 !ja->nr || 1515 
bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1516 sectors > ja->sectors_free) 1517 continue; 1518 1519 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1520 1521 bch2_bkey_append_ptr(&w->key, 1522 (struct bch_extent_ptr) { 1523 .offset = bucket_to_sector(ca, 1524 ja->buckets[ja->cur_idx]) + 1525 ca->mi.bucket_size - 1526 ja->sectors_free, 1527 .dev = ca->dev_idx, 1528 }); 1529 1530 ja->sectors_free -= sectors; 1531 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1532 1533 *replicas += ca->mi.durability; 1534 1535 if (*replicas >= replicas_want) 1536 break; 1537 } 1538 } 1539 1540 /** 1541 * journal_write_alloc - decide where to write next journal entry 1542 * 1543 * @j: journal object 1544 * @w: journal buf (entry to be written) 1545 * 1546 * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure 1547 */ 1548 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1549 { 1550 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1551 struct bch_devs_mask devs; 1552 struct dev_alloc_list devs_sorted; 1553 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1554 unsigned target = c->opts.metadata_target ?: 1555 c->opts.foreground_target; 1556 unsigned replicas = 0, replicas_want = 1557 READ_ONCE(c->opts.metadata_replicas); 1558 unsigned replicas_need = min_t(unsigned, replicas_want, 1559 READ_ONCE(c->opts.metadata_replicas_required)); 1560 bool advance_done = false; 1561 1562 rcu_read_lock(); 1563 1564 /* We might run more than once if we have to stop and do discards: */ 1565 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); 1566 bkey_for_each_ptr(ptrs, p) { 1567 struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); 1568 if (ca) 1569 replicas += ca->mi.durability; 1570 } 1571 1572 retry_target: 1573 devs = target_rw_devs(c, BCH_DATA_journal, target); 1574 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1575 retry_alloc: 1576 __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); 1577 1578 if (likely(replicas >= replicas_want)) 1579 goto done; 1580 1581 if (!advance_done) { 1582 journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); 1583 advance_done = true; 1584 goto retry_alloc; 1585 } 1586 1587 if (replicas < replicas_want && target) { 1588 /* Retry from all devices: */ 1589 target = 0; 1590 advance_done = false; 1591 goto retry_target; 1592 } 1593 done: 1594 rcu_read_unlock(); 1595 1596 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1597 1598 return replicas >= replicas_need ? 
0 : -BCH_ERR_insufficient_journal_devices; 1599 } 1600 1601 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1602 { 1603 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1604 1605 /* we aren't holding j->lock: */ 1606 unsigned new_size = READ_ONCE(j->buf_size_want); 1607 void *new_buf; 1608 1609 if (buf->buf_size >= new_size) 1610 return; 1611 1612 size_t btree_write_buffer_size = new_size / 64; 1613 1614 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1615 return; 1616 1617 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1618 if (!new_buf) 1619 return; 1620 1621 memcpy(new_buf, buf->data, buf->buf_size); 1622 1623 spin_lock(&j->lock); 1624 swap(buf->data, new_buf); 1625 swap(buf->buf_size, new_size); 1626 spin_unlock(&j->lock); 1627 1628 kvfree(new_buf); 1629 } 1630 1631 static CLOSURE_CALLBACK(journal_write_done) 1632 { 1633 closure_type(w, struct journal_buf, io); 1634 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1635 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1636 struct bch_replicas_padded replicas; 1637 u64 seq = le64_to_cpu(w->data->seq); 1638 int err = 0; 1639 1640 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1641 ? j->flush_write_time 1642 : j->noflush_write_time, j->write_start_time); 1643 1644 if (!w->devs_written.nr) { 1645 if (!bch2_journal_error(j)) 1646 bch_err(c, "unable to write journal to sufficient devices"); 1647 err = -BCH_ERR_journal_write_err; 1648 } else { 1649 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1650 w->devs_written); 1651 err = bch2_mark_replicas(c, &replicas.e); 1652 } 1653 1654 if (err) 1655 bch2_fatal_error(c); 1656 1657 closure_debug_destroy(cl); 1658 1659 spin_lock(&j->lock); 1660 if (seq >= j->pin.front) 1661 journal_seq_pin(j, seq)->devs = w->devs_written; 1662 if (err && (!j->err_seq || seq < j->err_seq)) 1663 j->err_seq = seq; 1664 w->write_done = true; 1665 1666 if (!j->free_buf || j->free_buf_size < w->buf_size) { 1667 swap(j->free_buf, w->data); 1668 swap(j->free_buf_size, w->buf_size); 1669 } 1670 1671 if (w->data) { 1672 void *buf = w->data; 1673 w->data = NULL; 1674 w->buf_size = 0; 1675 1676 spin_unlock(&j->lock); 1677 kvfree(buf); 1678 spin_lock(&j->lock); 1679 } 1680 1681 bool completed = false; 1682 bool do_discards = false; 1683 1684 for (seq = journal_last_unwritten_seq(j); 1685 seq <= journal_cur_seq(j); 1686 seq++) { 1687 w = j->buf + (seq & JOURNAL_BUF_MASK); 1688 if (!w->write_done) 1689 break; 1690 1691 if (!j->err_seq && !w->noflush) { 1692 j->flushed_seq_ondisk = seq; 1693 j->last_seq_ondisk = w->last_seq; 1694 1695 closure_wake_up(&c->freelist_wait); 1696 bch2_reset_alloc_cursors(c); 1697 } 1698 1699 j->seq_ondisk = seq; 1700 1701 /* 1702 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1703 * more buckets: 1704 * 1705 * Must come before signaling write completion, for 1706 * bch2_fs_journal_stop(): 1707 */ 1708 if (j->watermark != BCH_WATERMARK_stripe) 1709 journal_reclaim_kick(&c->journal); 1710 1711 closure_wake_up(&w->wait); 1712 completed = true; 1713 } 1714 1715 if (completed) { 1716 bch2_journal_reclaim_fast(j); 1717 bch2_journal_space_available(j); 1718 1719 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1720 1721 journal_wake(j); 1722 } 1723 1724 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1725 j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1726 struct journal_buf *buf = journal_cur_buf(j); 1727 long delta = buf->expires - 
			jiffies;

		/*
		 * We don't close a journal entry to write it while there are
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */
		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
	}

	/*
	 * We don't typically trigger journal writes from here - the next journal
	 * write will be triggered immediately after the previous one is
	 * allocated, in bch2_journal_write() - but the journal write error path
	 * is special:
	 */
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	if (do_discards)
		bch2_do_discards(c);
}

static void journal_write_endio(struct bio *bio)
{
	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
	struct bch_dev *ca = jbio->ca;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = j->buf + jbio->buf_idx;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
				   jbio->submit_time, !bio->bi_status);

	if (bio->bi_status) {
		bch_err_dev_ratelimited(ca,
					"error writing journal entry %llu: %s",
					le64_to_cpu(w->data->seq),
					bch2_blk_status_to_str(bio->bi_status));

		unsigned long flags;
		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&w->io);
	percpu_ref_put(&ca->io_ref[WRITE]);
}

static CLOSURE_CALLBACK(journal_write_submit)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
		if (!ca) {
			/* XXX: fix this */
			bch_err(c, "missing device %u for journal write", ptr->dev);
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		struct journal_device *ja = &ca->journal;
		struct journal_bio *jbio = ja->bio[w->idx];
		struct bio *bio = &jbio->bio;

		jbio->submit_time = local_clock();

		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector = ptr->offset;
		bio->bi_end_io = journal_write_endio;
		bio->bi_private = ca;
		bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);

		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
		ca->prev_journal_sector = bio->bi_iter.bi_sector;

		if (!JSET_NO_FLUSH(w->data))
			bio->bi_opf |= REQ_FUA;
		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
			bio->bi_opf |= REQ_PREFLUSH;

		bch2_bio_map(bio, w->data, sectors << 9);

		trace_and_count(c, journal_write, bio);
		closure_bio_submit(bio, cl);

		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
	}

	continue_at(cl, journal_write_done, j->wq);
}

static CLOSURE_CALLBACK(journal_write_preflush)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	/*
	 * Wait for previous journal writes to complete; they won't necessarily
	 * be flushed if they're still in flight
	 */
	if (j->seq_ondisk + 1 !=
le64_to_cpu(w->data->seq)) { 1836 spin_lock(&j->lock); 1837 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1838 closure_wait(&j->async_wait, cl); 1839 spin_unlock(&j->lock); 1840 continue_at(cl, journal_write_preflush, j->wq); 1841 return; 1842 } 1843 spin_unlock(&j->lock); 1844 } 1845 1846 if (w->separate_flush) { 1847 for_each_rw_member(c, ca) { 1848 percpu_ref_get(&ca->io_ref[WRITE]); 1849 1850 struct journal_device *ja = &ca->journal; 1851 struct bio *bio = &ja->bio[w->idx]->bio; 1852 bio_reset(bio, ca->disk_sb.bdev, 1853 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1854 bio->bi_end_io = journal_write_endio; 1855 bio->bi_private = ca; 1856 closure_bio_submit(bio, cl); 1857 } 1858 1859 continue_at(cl, journal_write_submit, j->wq); 1860 } else { 1861 /* 1862 * no need to punt to another work item if we're not waiting on 1863 * preflushes 1864 */ 1865 journal_write_submit(&cl->work); 1866 } 1867 } 1868 1869 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1870 { 1871 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1872 struct jset_entry *start, *end; 1873 struct jset *jset = w->data; 1874 struct journal_keys_to_wb wb = { NULL }; 1875 unsigned sectors, bytes, u64s; 1876 unsigned long btree_roots_have = 0; 1877 bool validate_before_checksum = false; 1878 u64 seq = le64_to_cpu(jset->seq); 1879 int ret; 1880 1881 /* 1882 * Simple compaction, dropping empty jset_entries (from journal 1883 * reservations that weren't fully used) and merging jset_entries that 1884 * can be. 1885 * 1886 * If we wanted to be really fancy here, we could sort all the keys in 1887 * the jset and drop keys that were overwritten - probably not worth it: 1888 */ 1889 vstruct_for_each(jset, i) { 1890 unsigned u64s = le16_to_cpu(i->u64s); 1891 1892 /* Empty entry: */ 1893 if (!u64s) 1894 continue; 1895 1896 /* 1897 * New btree roots are set by journalling them; when the journal 1898 * entry gets written we have to propagate them to 1899 * c->btree_roots 1900 * 1901 * But, every journal entry we write has to contain all the 1902 * btree roots (at least for now); so after we copy btree roots 1903 * to c->btree_roots we have to get any missing btree roots and 1904 * add them to this journal entry: 1905 */ 1906 switch (i->type) { 1907 case BCH_JSET_ENTRY_btree_root: 1908 bch2_journal_entry_to_btree_root(c, i); 1909 __set_bit(i->btree_id, &btree_roots_have); 1910 break; 1911 case BCH_JSET_ENTRY_write_buffer_keys: 1912 EBUG_ON(!w->need_flush_to_write_buffer); 1913 1914 if (!wb.wb) 1915 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1916 1917 jset_entry_for_each_key(i, k) { 1918 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1919 if (ret) { 1920 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1921 bch2_err_str(ret)); 1922 bch2_journal_keys_to_write_buffer_end(c, &wb); 1923 return ret; 1924 } 1925 } 1926 i->type = BCH_JSET_ENTRY_btree_keys; 1927 break; 1928 } 1929 } 1930 1931 if (wb.wb) { 1932 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1933 if (ret) { 1934 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1935 bch2_err_str(ret)); 1936 return ret; 1937 } 1938 } 1939 1940 spin_lock(&c->journal.lock); 1941 w->need_flush_to_write_buffer = false; 1942 spin_unlock(&c->journal.lock); 1943 1944 start = end = vstruct_last(jset); 1945 1946 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1947 1948 struct jset_entry_datetime *d = 1949 container_of(jset_entry_init(&end, sizeof(*d)), struct 
			     jset_entry_datetime, entry);
	d->entry.type = BCH_JSET_ENTRY_datetime;
	d->seconds = cpu_to_le64(ktime_get_real_seconds());

	bch2_journal_super_entries_add_common(c, &end, seq);
	u64s = (u64 *) end - (u64 *) start;

	WARN_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);

	sectors = vstruct_sectors(jset, c->block_bits);
	bytes = vstruct_bytes(jset);

	if (sectors > w->sectors) {
		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
				    vstruct_bytes(jset), w->sectors << 9,
				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
		return -EINVAL;
	}

	jset->magic = cpu_to_le64(jset_magic(c));
	jset->version = cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = seq;

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
			   jset->encrypted_start,
			   vstruct_end(jset) - (void *) jset->encrypted_start);
	if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
		return ret;

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
	return 0;
}

static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	int error = bch2_journal_error(j);

	/*
	 * If the journal is in an error state - we did an emergency shutdown -
	 * we prefer to continue doing journal writes. We just mark them as
	 * noflush so they'll never be used, but they'll still be visible to the
	 * list_journal tool - this helps in debugging.
	 *
	 * There's a caveat: the first journal write after marking the
	 * superblock dirty must always be a flush write, because on startup
	 * from a clean shutdown we didn't necessarily read the journal and the
	 * new journal write might overwrite whatever was in the journal
	 * previously - we can't leave the journal without any flush writes in
	 * it.
	 *
	 * So if we're in an error state, and we're still starting up, we don't
	 * write anything at all.
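	 *
	 * ("Still starting up" is tracked via the JOURNAL_need_flush_write bit
	 * tested just below - it's only cleared once a flush write is actually
	 * picked.)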
2026 */ 2027 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 2028 return error; 2029 2030 if (error || 2031 w->noflush || 2032 (!w->must_flush && 2033 time_before(jiffies, j->last_flush_write + 2034 msecs_to_jiffies(c->opts.journal_flush_delay)) && 2035 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 2036 w->noflush = true; 2037 SET_JSET_NO_FLUSH(w->data, true); 2038 w->data->last_seq = 0; 2039 w->last_seq = 0; 2040 2041 j->nr_noflush_writes++; 2042 } else { 2043 w->must_flush = true; 2044 j->last_flush_write = jiffies; 2045 j->nr_flush_writes++; 2046 clear_bit(JOURNAL_need_flush_write, &j->flags); 2047 } 2048 2049 return 0; 2050 } 2051 2052 CLOSURE_CALLBACK(bch2_journal_write) 2053 { 2054 closure_type(w, struct journal_buf, io); 2055 struct journal *j = container_of(w, struct journal, buf[w->idx]); 2056 struct bch_fs *c = container_of(j, struct bch_fs, journal); 2057 struct bch_replicas_padded replicas; 2058 unsigned nr_rw_members = 0; 2059 int ret; 2060 2061 for_each_rw_member(c, ca) 2062 nr_rw_members++; 2063 2064 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 2065 BUG_ON(!w->write_started); 2066 BUG_ON(w->write_allocated); 2067 BUG_ON(w->write_done); 2068 2069 j->write_start_time = local_clock(); 2070 2071 spin_lock(&j->lock); 2072 if (nr_rw_members > 1) 2073 w->separate_flush = true; 2074 2075 ret = bch2_journal_write_pick_flush(j, w); 2076 spin_unlock(&j->lock); 2077 if (ret) 2078 goto err; 2079 2080 mutex_lock(&j->buf_lock); 2081 journal_buf_realloc(j, w); 2082 2083 ret = bch2_journal_write_prep(j, w); 2084 mutex_unlock(&j->buf_lock); 2085 if (ret) 2086 goto err; 2087 2088 j->entry_bytes_written += vstruct_bytes(w->data); 2089 2090 while (1) { 2091 spin_lock(&j->lock); 2092 ret = journal_write_alloc(j, w); 2093 if (!ret || !j->can_discard) 2094 break; 2095 2096 spin_unlock(&j->lock); 2097 bch2_journal_do_discards(j); 2098 } 2099 2100 if (ret && !bch2_journal_error(j)) { 2101 struct printbuf buf = PRINTBUF; 2102 buf.atomic++; 2103 2104 __bch2_journal_debug_to_text(&buf, j); 2105 spin_unlock(&j->lock); 2106 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), 2107 le64_to_cpu(w->data->seq), 2108 vstruct_sectors(w->data, c->block_bits), 2109 bch2_err_str(ret)); 2110 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2111 printbuf_exit(&buf); 2112 } 2113 if (ret) 2114 goto err; 2115 2116 /* 2117 * write is allocated, no longer need to account for it in 2118 * bch2_journal_space_available(): 2119 */ 2120 w->sectors = 0; 2121 w->write_allocated = true; 2122 2123 /* 2124 * journal entry has been compacted and allocated, recalculate space 2125 * available: 2126 */ 2127 bch2_journal_space_available(j); 2128 bch2_journal_do_writes(j); 2129 spin_unlock(&j->lock); 2130 2131 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2132 2133 if (c->opts.nochanges) 2134 goto no_io; 2135 2136 /* 2137 * Mark journal replicas before we submit the write to guarantee 2138 * recovery will find the journal entries after a crash. 2139 */ 2140 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2141 w->devs_written); 2142 ret = bch2_mark_replicas(c, &replicas.e); 2143 if (ret) 2144 goto err; 2145 2146 if (!JSET_NO_FLUSH(w->data)) 2147 continue_at(cl, journal_write_preflush, j->wq); 2148 else 2149 continue_at(cl, journal_write_submit, j->wq); 2150 return; 2151 no_io: 2152 continue_at(cl, journal_write_done, j->wq); 2153 return; 2154 err: 2155 bch2_fatal_error(c); 2156 continue_at(cl, journal_write_done, j->wq); 2157 } 2158