// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
#include "trace.h"

#include <linux/random.h>

static const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
	BTREE_UPDATE_MODES()
#undef x
	NULL
};

static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);

static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
				  btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);

/*
 * Verify that child nodes correctly span parent node's range:
 */
int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
		: b->data->min_key;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	struct bkey_buf prev;
	int ret = 0;

	printbuf_indent_add_nextline(&buf, 2);

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	bch2_bkey_buf_init(&prev);
	bkey_init(&prev.k->k);
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);

	if (b == btree_node_root(c, b)) {
		if (!bpos_eq(b->data->min_key, POS_MIN)) {
			ret = __bch2_topology_error(c, &buf);

			bch2_bpos_to_text(&buf, b->data->min_key);
			log_fsck_err(trans, btree_root_bad_min_key,
				     "btree root with incorrect min_key: %s", buf.buf);
			goto out;
		}

		if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
			ret = __bch2_topology_error(c, &buf);
			bch2_bpos_to_text(&buf, b->data->max_key);
			log_fsck_err(trans, btree_root_bad_max_key,
				     "btree root with incorrect max_key: %s", buf.buf);
			goto out;
		}
	}

	if (!b->c.level)
		goto out;
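	/*
	 * Walk this node's children (via the node itself plus any keys still
	 * in the journal) and check that each child's min_key is exactly the
	 * successor of the previous child's max_key:
	 */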
	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			goto out;

		struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

		struct bpos expected_min = bkey_deleted(&prev.k->k)
			? node_min
			: bpos_successor(prev.k->k.p);

		if (!bpos_eq(expected_min, bp.v->min_key)) {
			ret = __bch2_topology_error(c, &buf);

			prt_str(&buf, "end of prev node doesn't match start of next node\nin ");
			bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
			prt_str(&buf, " node ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			prt_str(&buf, "\nprev ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
			prt_str(&buf, "\nnext ");
			bch2_bkey_val_to_text(&buf, c, k);

			log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
			goto out;
		}

		bch2_bkey_buf_reassemble(&prev, c, k);
		bch2_btree_and_journal_iter_advance(&iter);
	}

	if (bkey_deleted(&prev.k->k)) {
		ret = __bch2_topology_error(c, &buf);

		prt_str(&buf, "empty interior node\nin ");
		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
		prt_str(&buf, " node ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

		log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
		ret = __bch2_topology_error(c, &buf);

		prt_str(&buf, "last child node doesn't end at end of parent node\nin ");
		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
		prt_str(&buf, " node ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, "\nlast key ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));

		log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
	}
out:
fsck_err:
	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&prev, c);
	printbuf_exit(&buf);
	return ret;
}

/* Calculate ideal packed bkey format for new btree nodes: */

static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bkey uk;

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k)) {
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}

static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}
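/*
 * Estimate the size, in u64s, of a node's live keys if they were repacked
 * with @new_f:
 *
 *   delta = (new_f->key_u64s - old_f->key_u64s) * nr.packed_keys +
 *	     (new_f->key_u64s - BKEY_U64s)	 * nr.unpacked_keys
 *
 * Packed keys change by the difference in packed key size; keys currently
 * stored unpacked (a full struct bkey, BKEY_U64s) shrink to the new packed
 * size. Values are unaffected, so the result is nr.live_u64s + delta:
 */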
static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
					  struct bkey_format *old_f,
					  struct bkey_format *new_f)
{
	/* stupid integer promotion rules */
	ssize_t delta =
		(((int) new_f->key_u64s - old_f->key_u64s) *
		 (int) nr.packed_keys) +
		(((int) new_f->key_u64s - BKEY_U64s) *
		 (int) nr.unpacked_keys);

	BUG_ON(delta + nr.live_u64s < 0);

	return nr.live_u64s + delta;
}

/**
 * bch2_btree_node_format_fits - check if we could rewrite node with a new format
 *
 * @c:		filesystem handle
 * @b:		btree node to rewrite
 * @nr:		number of keys for new node (i.e. b->nr)
 * @new_f:	bkey format to translate keys to
 *
 * Returns: true if all re-packed keys will be able to fit in a new node.
 *
 * Assumes all keys will successfully pack with the new format.
 */
static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
					struct btree_nr_keys nr,
					struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}

/* Btree node freeing/allocation: */

static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;

	trace_and_count(c, btree_node_free, trans, b);

	BUG_ON(btree_node_write_blocked(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(b->ob.nr);
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);
}

static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree_path *path,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;

	bch2_btree_node_lock_write_nofail(trans, path, &b->c);

	__btree_node_free(trans, b);

	mutex_lock(&c->btree_cache.lock);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	mutex_unlock(&c->btree_cache.lock);

	six_unlock_write(&b->c.lock);
	mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);

	bch2_trans_node_drop(trans, b);
}

static void bch2_btree_node_free_never_used(struct btree_update *as,
					    struct btree_trans *trans,
					    struct btree *b)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];

	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));

	b->will_make_reachable = 0;
	closure_put(&as->cl);

	clear_btree_node_will_make_reachable(b);
	clear_btree_node_accessed(b);
	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);

	mutex_lock(&c->btree_cache.lock);
	__bch2_btree_node_hash_remove(&c->btree_cache, b);
	mutex_unlock(&c->btree_cache.lock);

	BUG_ON(p->nr >= ARRAY_SIZE(p->b));
	p->b[p->nr++] = b;

	six_unlock_intent(&b->c.lock);

	bch2_trans_node_drop(trans, b);
}
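/*
 * Allocate a new btree node: get an in-memory node from the btree node cache,
 * then pick where it will live on disk - either by taking a previously
 * reserved key + open buckets from c->btree_reserve_cache, or by allocating
 * fresh sectors via the btree write point:
 */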
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
					     struct disk_reservation *res,
					     struct closure *cl,
					     bool interior_node,
					     unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct btree *b;
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct open_buckets obs = { .nr = 0 };
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
		? BTREE_NODE_RESERVE
		: 0;
	int ret;

	b = bch2_btree_node_mem_alloc(trans, interior_node);
	if (IS_ERR(b))
		return b;

	BUG_ON(b->ob.nr);

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		obs = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto out;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);
retry:
	ret = bch2_alloc_sectors_start_trans(trans,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      min(res->nr_replicas,
					  c->opts.metadata_replicas_required),
				      watermark, 0, cl, &wp);
	if (unlikely(ret))
		goto err;

	if (wp->sectors_free < btree_sectors(c)) {
		struct open_bucket *ob;
		unsigned i;

		open_bucket_for_each(c, &wp->ptrs, ob, i)
			if (ob->sectors_free < btree_sectors(c))
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

	bkey_btree_ptr_v2_init(&tmp.k);
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);

	bch2_open_bucket_get(c, wp, &obs);
	bch2_alloc_sectors_done(c, wp);
out:
	bkey_copy(&b->key, &tmp.k);
	b->ob = obs;
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return b;
err:
	bch2_btree_node_to_freelist(c, b);
	return ERR_PTR(ret);
}

static struct btree *bch2_btree_node_alloc(struct btree_update *as,
					   struct btree_trans *trans,
					   unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
	int ret;

	BUG_ON(level >= BTREE_MAX_DEPTH);
	BUG_ON(!p->nr);

	b = p->b[--p->nr];

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	set_btree_node_accessed(b);
	set_btree_node_dirty_acct(c, b);
	set_btree_node_need_write(b);

	bch2_bset_init_first(b, &b->data->keys);
	b->c.level = level;
	b->c.btree_id = as->btree_id;
	b->version_ondisk = c->sb.version;

	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
	b->data->flags = 0;
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr = 0;
		bp->v.seq = b->data->keys.seq;
		bp->v.sectors_written = 0;
	}

	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);

	bch2_btree_build_aux_trees(b);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

	trace_and_count(c, btree_node_alloc, trans, b);
	bch2_increment_clock(c, btree_sectors(c), WRITE);
	return b;
}

static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}
static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree_trans *trans,
						       struct btree *b)
{
	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
	struct bkey_format format = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
		format = b->format;

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format = format;
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);
	return n;
}

static struct btree *__btree_root_alloc(struct btree_update *as,
					struct btree_trans *trans, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, trans, level);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);

	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

	return b;
}

static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p;

	for (p = as->prealloc_nodes;
	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
	     p++) {
		while (p->nr) {
			struct btree *b = p->b[--p->nr];

			mutex_lock(&c->btree_reserve_cache_lock);

			if (c->btree_reserve_cache_nr <
			    ARRAY_SIZE(c->btree_reserve_cache)) {
				struct btree_alloc *a =
					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

				a->ob = b->ob;
				b->ob.nr = 0;
				bkey_copy(&a->k, &b->key);
			} else {
				bch2_open_buckets_put(c, &b->ob);
			}

			mutex_unlock(&c->btree_reserve_cache_lock);

			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
			__btree_node_free(trans, b);
			bch2_btree_node_to_freelist(c, b);
		}
	}
}

static int bch2_btree_reserve_get(struct btree_trans *trans,
				  struct btree_update *as,
				  unsigned nr_nodes[2],
				  unsigned flags,
				  struct closure *cl)
{
	struct btree *b;
	unsigned interior;
	int ret = 0;

	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 */
	ret = bch2_btree_cache_cannibalize_lock(trans, cl);
	if (ret)
		return ret;

	for (interior = 0; interior < 2; interior++) {
		struct prealloc_nodes *p = as->prealloc_nodes + interior;

		while (p->nr < nr_nodes[interior]) {
			b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
						    interior, flags);
			if (IS_ERR(b)) {
				ret = PTR_ERR(b);
				goto err;
			}

			p->b[p->nr++] = b;
		}
	}
err:
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}

/* Asynchronous interior node update machinery */
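/*
 * Roughly, the lifecycle of an interior update: a btree_update is created by
 * bch2_btree_update_start() and handed off by bch2_btree_update_done(); when
 * all writes to the new nodes complete, the closure fires
 * btree_update_set_nodes_written(), which punts to btree_interior_update_work()
 * -> btree_update_nodes_written() to do the transactional part (journalling
 * the update, running triggers), and finally frees the update via
 * bch2_btree_update_free(), below:
 */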
static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;

	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_pin_flush(&c->journal, &as->journal);
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
			       as->start_time);

	mutex_lock(&c->btree_interior_update_lock);
	list_del(&as->unwritten_list);
	list_del(&as->list);

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	/*
	 * Have to do the wakeup with btree_interior_update_lock still held,
	 * since being on btree_interior_update_list is our ref on @c:
	 */
	closure_wake_up(&c->btree_interior_update_wait);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_add_key(struct btree_update *as,
				 struct keylist *keys, struct btree *b)
{
	struct bkey_i *k = &b->key;

	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));

	bkey_copy(keys->top, k);
	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;

	bch2_keylist_push(keys);
}

static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
{
	for_each_keylist_key(&as->new_keys, k)
		if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
			return false;
	return true;
}

static void btree_update_new_nodes_mark_sb(struct btree_update *as)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->sb_lock);
	for_each_keylist_key(&as->new_keys, k)
		bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
}

/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
{
	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
	int ret = PTR_ERR_OR_ZERO(e);
	if (ret)
		return ret;

	memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));

	trans->journal_pin = &as->journal;

	for_each_keylist_key(&as->old_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
					   BTREE_TRIGGER_transactional);
		if (ret)
			return ret;
	}

	for_each_keylist_key(&as->new_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
					   BTREE_TRIGGER_transactional);
		if (ret)
			return ret;
	}

	return 0;
}

/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
{
	struct btree_node *b_data = READ_ONCE(b->data);

	return (b_data ? b_data->keys.seq : 0) == seq;
}
static void btree_update_nodes_written(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct btree_trans *trans = bch2_trans_get(c);
	u64 journal_seq = 0;
	unsigned i;
	int ret;

	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	if (!btree_update_new_nodes_marked_sb(as))
		btree_update_new_nodes_mark_sb(as);

	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		b = as->old_nodes[i];

		if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
	}

	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
	ret = commit_do(trans, &as->disk_res, &journal_seq,
			BCH_WATERMARK_interior_updates|
			BCH_TRANS_COMMIT_no_enospc|
			BCH_TRANS_COMMIT_no_check_rw|
			BCH_TRANS_COMMIT_journal_reclaim,
			btree_update_nodes_written_trans(trans, as));
	bch2_trans_unlock(trans);

	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "%s", bch2_err_str(ret));
err:
	/*
	 * Ensure transaction is unlocked before using btree_node_lock_nopath()
	 * (the use of which is always suspect, we need to work on removing this
	 * in the future)
	 *
	 * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
	 * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
	 * rarely end up with a locked path besides the one we have here:
	 */
	bch2_trans_unlock(trans);
	bch2_trans_begin(trans);

	/*
	 * We have to be careful because another thread might be getting ready
	 * to free as->b and calling btree_update_reparent() on us - we'll
	 * recheck under btree_update_lock below:
	 */
	b = READ_ONCE(as->b);
	if (b) {
		/*
		 * @b is the node we did the final insert into:
		 *
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */

		btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
						as->btree_id, b->c.level, b->key.k.p);
		struct btree_path *path = trans->paths + path_idx;
		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
		path->l[b->c.level].b = b;

		bch2_btree_node_lock_write_nofail(trans, path, &b->c);

		mutex_lock(&c->btree_interior_update_lock);

		list_del(&as->write_blocked_list);
		if (list_empty(&b->write_blocked))
			clear_btree_node_write_blocked(b);

		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

			if (!ret) {
				struct bset *last = btree_bset_last(b);

				last->journal_seq = cpu_to_le64(
						max(journal_seq,
						    le64_to_cpu(last->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
		}

		mutex_unlock(&c->btree_interior_update_lock);

		mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		six_unlock_write(&b->c.lock);

		btree_node_write_if_need(trans, b, SIX_LOCK_intent);
		btree_node_unlock(trans, path, b->c.level);
		bch2_path_put(trans, path_idx, true);
	}

	bch2_journal_pin_drop(&c->journal, &as->journal);

	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
		clear_btree_node_will_make_reachable(b);
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		btree_node_write_if_need(trans, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);

	bch2_btree_update_free(as, trans);
	bch2_trans_put(trans);
}
static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;

	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);

		if (!as)
			break;

		btree_update_nodes_written(as);
	}
}

static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
	closure_type(as, struct btree_update, cl);
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}

/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_UPDATE_none);
	BUG_ON(as->update_level_end < b->c.level);
	BUG_ON(!btree_node_dirty(b));
	BUG_ON(!b->c.level);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode = BTREE_UPDATE_node;
	as->b = b;
	as->update_level_end = b->c.level;

	set_btree_node_write_blocked(b);
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}

static int bch2_update_reparent_journal_pin_flush(struct journal *j,
						  struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}
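/*
 * @child is an update that was blocking a node @as is now freeing: @as takes
 * over @child's journal pin, keeping it held until @as itself completes:
 */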
static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

	lockdep_assert_held(&c->btree_interior_update_lock);

	child->b = NULL;
	child->mode = BTREE_UPDATE_update;

	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
			      bch2_update_reparent_journal_pin_flush);
}

static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
	struct bkey_i *insert = &b->key;
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_UPDATE_none);

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode = BTREE_UPDATE_root;
	mutex_unlock(&c->btree_interior_update_lock);
}

/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	closure_get(&as->cl);

	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;
	set_btree_node_will_make_reachable(b);

	mutex_unlock(&c->btree_interior_update_lock);

	btree_update_add_key(as, &as->new_keys, b);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;

		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
			cpu_to_le16(sectors);
	}
}

/*
 * Drop @b from the list of nodes its btree_update was making reachable, if it
 * was a new node:
 */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
	v = xchg(&b->will_make_reachable, 0);
	clear_btree_node_will_make_reachable(b);
	as = (struct btree_update *) (v & ~1UL);
	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}
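/*
 * Transfer @b's open bucket references to @as; they're released in
 * btree_update_nodes_written(), once the update is complete:
 */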
static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
}

static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
							       struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);
	clear_btree_node_write_blocked(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_add_key(as, &as->old_keys, b);

	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
}

static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	u64 start_time = as->start_time;

	BUG_ON(as->mode == BTREE_UPDATE_none);

	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

	bch2_btree_reserve_put(as, trans);

	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
			       start_time);
}

static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
			unsigned level_start, bool split, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	u64 start_time = local_clock();
	int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
	unsigned nr_nodes[2] = { 0, 0 };
	unsigned level_end = level_start;
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	int ret = 0;
	u32 restart_count = trans->restart_count;

	BUG_ON(!path->should_be_locked);

	if (watermark == BCH_WATERMARK_copygc)
		watermark = BCH_WATERMARK_btree_copygc;
	if (watermark < BCH_WATERMARK_btree)
		watermark = BCH_WATERMARK_btree;

	flags &= ~BCH_WATERMARK_MASK;
	flags |= watermark;

	if (watermark < BCH_WATERMARK_reclaim &&
	    test_bit(JOURNAL_space_low, &c->journal.flags)) {
		if (flags & BCH_TRANS_COMMIT_journal_reclaim)
			return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);

		ret = drop_locks_do(trans,
			({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; }));
		if (ret)
			return ERR_PTR(ret);
	}

	while (1) {
		nr_nodes[!!level_end] += 1 + split;
		level_end++;

		ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
		if (ret)
			return ERR_PTR(ret);

		if (!btree_path_node(path, level_end)) {
			/* Allocating new root? */
			nr_nodes[1] += split;
			level_end = BTREE_MAX_DEPTH;
			break;
		}

		/*
		 * Always check for space for two keys, even if we won't have to
		 * split at prior level - it might have been a merge instead:
		 */
		if (bch2_btree_node_insert_fits(path->l[level_end].b,
						BKEY_BTREE_PTR_U64s_MAX * 2))
			break;

		split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
	}

	if (!down_read_trylock(&c->gc_lock)) {
		ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
		if (ret) {
			up_read(&c->gc_lock);
			return ERR_PTR(ret);
		}
	}

	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c = c;
	as->start_time = start_time;
	as->ip_started = _RET_IP_;
	as->mode = BTREE_UPDATE_none;
	as->flags = flags;
	as->took_gc_lock = true;
	as->btree_id = path->btree_id;
	as->update_level_start = level_start;
	as->update_level_end = level_end;
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	ret = bch2_disk_reservation_get(c, &as->disk_res,
			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
			READ_ONCE(c->opts.metadata_replicas),
			disk_res_flags);
	if (ret)
		goto err;

	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
	if (bch2_err_matches(ret, ENOSPC) ||
	    bch2_err_matches(ret, ENOMEM)) {
		struct closure cl;

		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if (bch2_err_matches(ret, ENOSPC) &&
		    (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
		    watermark < BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			goto err;
		}

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);

			bch2_trans_unlock(trans);
			bch2_wait_on_allocator(c, &cl);
		} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
	}

	if (ret) {
		trace_and_count(c, btree_reserve_get_fail, trans->fn,
				_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
		goto err;
	}

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	bch2_trans_verify_not_restarted(trans, restart_count);
	return as;
err:
	bch2_btree_update_free(as, trans);
	if (!bch2_err_matches(ret, ENOSPC) &&
	    !bch2_err_matches(ret, EROFS) &&
	    ret != -BCH_ERR_journal_reclaim_would_deadlock &&
	    ret != -BCH_ERR_journal_shutdown)
		bch_err_fn_ratelimited(c, ret);
	return ERR_PTR(ret);
}

/* Btree root updates: */

static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

	mutex_lock(&c->btree_root_lock);
	bch2_btree_id_root(c, b->c.btree_id)->b = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

static int bch2_btree_set_root(struct btree_update *as,
			       struct btree_trans *trans,
			       struct btree_path *path,
			       struct btree *b,
			       bool nofail)
{
	struct bch_fs *c = as->c;

	trace_and_count(c, btree_node_set_root, trans, b);

	struct btree *old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
	if (nofail) {
		bch2_btree_node_lock_write_nofail(trans, path, &old->c);
	} else {
		int ret = bch2_btree_node_lock_write(trans, path, &old->c);
		if (ret)
			return ret;
	}

	bch2_btree_set_root_inmem(c, b);

	btree_update_updated_root(as, b);

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
	bch2_btree_node_unlock_write(trans, path, old);
	return 0;
}

/* Interior node updates: */

static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
					struct btree_path *path,
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
{
	struct bch_fs *c = as->c;
	struct bkey_packed *k;
	struct printbuf buf = PRINTBUF;
	unsigned long old, new;

	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(bkey_i_to_s_c(insert)));

	if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	struct bkey_validate_context from = (struct bkey_validate_context) {
		.from = BKEY_VALIDATE_btree_node,
		.level = b->c.level,
		.btree = b->c.btree_id,
		.flags = BCH_VALIDATE_commit,
	};
	if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
		bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
		dump_stack();
	}

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
		bch2_btree_node_iter_advance(node_iter, b);

	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
	set_btree_node_dirty_acct(c, b);

	old = READ_ONCE(b->flags);
	do {
		new = old;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_interior;
		new |= 1 << BTREE_NODE_need_write;
	} while (!try_cmpxchg(&b->flags, &old, new));

	printbuf_exit(&buf);
}

static int
bch2_btree_insert_keys_interior(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct btree_node_iter node_iter,
				struct keylist *keys)
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	for (;
	     insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
	     insert = bkey_next(insert))
		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);

	int ret = bch2_btree_node_check_topology(trans, b);
	if (ret) {
		struct printbuf buf = PRINTBUF;

		for (struct bkey_i *k = keys->keys;
		     k != insert;
		     k = bkey_next(k)) {
			bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
			prt_newline(&buf);
		}

		bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
				    (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
		dump_stack();
		return ret;
	}

	memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
	keys->top_p -= insert->_data - keys->keys_p;
	return 0;
}

static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
{
	if (insert_keys)
		for_each_keylist_key(insert_keys, k)
			if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
				return true;
	return false;
}

/*
 * Move keys from n1 (original replacement node, now lower node) to n2 (higher
 * node)
 */
static void __btree_split_node(struct btree_update *as,
			       struct btree_trans *trans,
			       struct btree *b,
			       struct btree *n[2],
			       struct keylist *insert_keys)
{
	struct bkey_packed *k;
	struct bpos n1_pos = POS_MIN;
	struct btree_node_iter iter;
	struct bset *bsets[2];
	struct bkey_format_state format[2];
	struct bkey_packed *out[2];
	struct bkey uk;
	unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5;
	struct { unsigned nr_keys, val_u64s; } nr_keys[2];
	int i;

	memset(&nr_keys, 0, sizeof(nr_keys));

	for (i = 0; i < 2; i++) {
		BUG_ON(n[i]->nsets != 1);

		bsets[i] = btree_bset_first(n[i]);
		out[i] = bsets[i]->start;

		SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1);
		bch2_bkey_format_init(&format[i]);
	}

	u64s = 0;
	for_each_btree_node_key(b, k, &iter) {
		if (bkey_deleted(k))
			continue;

		uk = bkey_unpack_key(b, k);

		if (b->c.level &&
		    u64s < n1_u64s &&
		    u64s + k->u64s >= n1_u64s &&
		    (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
		     key_deleted_in_insert(insert_keys, uk.p)))
			n1_u64s += k->u64s;

		i = u64s >= n1_u64s;
		u64s += k->u64s;
		if (!i)
			n1_pos = uk.p;
		bch2_bkey_format_add_key(&format[i], &uk);

		nr_keys[i].nr_keys++;
		nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k);
	}

	btree_set_min(n[0], b->data->min_key);
	btree_set_max(n[0], n1_pos);
	btree_set_min(n[1], bpos_successor(n1_pos));
	btree_set_max(n[1], b->data->max_key);

	for (i = 0; i < 2; i++) {
		bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key);
		bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key);

		n[i]->data->format = bch2_bkey_format_done(&format[i]);

		unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
			nr_keys[i].val_u64s;
		if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
			n[i]->data->format = b->format;

		btree_node_set_format(n[i], n[i]->data->format);
	}

	u64s = 0;
	for_each_btree_node_key(b, k, &iter) {
		if (bkey_deleted(k))
			continue;

		i = u64s >= n1_u64s;
		u64s += k->u64s;

		if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k)
					? &b->format: &bch2_bkey_format_current, k))
			out[i]->format = KEY_FORMAT_LOCAL_BTREE;
		else
			bch2_bkey_unpack(b, (void *) out[i], k);

		out[i]->needs_whiteout = false;

		btree_keys_account_key_add(&n[i]->nr, 0, out[i]);
		out[i] = bkey_p_next(out[i]);
	}

	for (i = 0; i < 2; i++) {
		bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data);

		BUG_ON(!bsets[i]->u64s);

		set_btree_bset_end(n[i], n[i]->set);

		btree_node_reset_sib_u64s(n[i]);

		bch2_verify_btree_nr_keys(n[i]);

		BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
	}
}

/*
 * For updates to interior nodes, we've got to do the insert before we split
 * because the stuff we're inserting has to be inserted atomically. Post split,
 * the keys might have to go in different nodes and the split would no longer be
 * atomic.
 *
 * Worse, if the insert is from btree node coalescing, if we do the insert after
 * we do the split (and pick the pivot) - the pivot we pick might be between
 * nodes that were coalesced, and thus in the middle of a child node post
 * coalescing:
 */
static int btree_split_insert_keys(struct btree_update *as,
				   struct btree_trans *trans,
				   btree_path_idx_t path_idx,
				   struct btree *b,
				   struct keylist *keys)
{
	struct btree_path *path = trans->paths + path_idx;

	if (!bch2_keylist_empty(keys) &&
	    bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
		struct btree_node_iter node_iter;

		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);

		int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
		if (ret)
			return ret;
	}

	return 0;
}

static int btree_split(struct btree_update *as, struct btree_trans *trans,
		       btree_path_idx_t path, struct btree *b,
		       struct keylist *keys)
{
	struct bch_fs *c = as->c;
	struct btree *parent = btree_node_parent(trans->paths + path, b);
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	btree_path_idx_t path1 = 0, path2 = 0;
	u64 start_time = local_clock();
	int ret = 0;

	bch2_verify_btree_nr_keys(b);
	BUG_ON(!parent && (b != btree_node_root(c, b)));
	BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));

	ret = bch2_btree_node_check_topology(trans, b);
	if (ret)
		return ret;

	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
		struct btree *n[2];

		trace_and_count(c, btree_node_split, trans, b);

		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);

		__btree_split_node(as, trans, b, n, keys);

		if (keys) {
			ret = btree_split_insert_keys(as, trans, path, n1, keys) ?:
				btree_split_insert_keys(as, trans, path, n2, keys);
			if (ret)
				goto err;
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n2);
		bch2_btree_build_aux_trees(n1);

		bch2_btree_update_add_new_node(as, n1);
		bch2_btree_update_add_new_node(as, n2);
		six_unlock_write(&n2->c.lock);
		six_unlock_write(&n1->c.lock);

		path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path1, n1);

		path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p);
		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path2, n2);

		/*
		 * Note that on recursive parent_keys == keys, so we
		 * can't start adding new keys to parent_keys before emptying it
		 * out (which we did with btree_split_insert_keys() above)
		 */
		bch2_keylist_add(&as->parent_keys, &n1->key);
		bch2_keylist_add(&as->parent_keys, &n2->key);

		if (!parent) {
			/* Depth increases, make a new root */
			n3 = __btree_root_alloc(as, trans, b->c.level + 1);

			bch2_btree_update_add_new_node(as, n3);
			six_unlock_write(&n3->c.lock);
			trans->paths[path2].locks_want++;
			BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
			mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
			bch2_btree_path_level_init(trans, trans->paths + path2, n3);

			n3->sib_u64s[0] = U16_MAX;
			n3->sib_u64s[1] = U16_MAX;

			ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
			if (ret)
				goto err;
		}
	} else {
		trace_and_count(c, btree_node_compact, trans, b);

		n1 = bch2_btree_node_alloc_replacement(as, trans, b);

		if (keys) {
			ret = btree_split_insert_keys(as, trans, path, n1, keys);
			if (ret)
				goto err;
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n1);
		bch2_btree_update_add_new_node(as, n1);
		six_unlock_write(&n1->c.lock);

		path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path1, n1);

		if (parent)
			bch2_keylist_add(&as->parent_keys, &n1->key);
	}

	/* New nodes all written, now make them visible: */

	if (parent) {
		/* Split a non root node */
		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
	} else if (n3) {
		ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
	} else {
		/* Root filled up but didn't need to be split */
		ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
	}

	if (ret)
		goto err;

	bch2_btree_interior_update_will_free_node(as, b);

	if (n3) {
		bch2_btree_update_get_open_buckets(as, n3);
		bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
	}
	if (n2) {
		bch2_btree_update_get_open_buckets(as, n2);
		bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
	}
	bch2_btree_update_get_open_buckets(as, n1);
	bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);

	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
	 * nodes - else another thread could re-acquire a read lock on the old
	 * node after another thread has locked and updated the new node, thus
	 * seeing stale data:
	 */
	bch2_btree_node_free_inmem(trans, trans->paths + path, b);

	if (n3)
		bch2_trans_node_add(trans, trans->paths + path, n3);
	if (n2)
		bch2_trans_node_add(trans, trans->paths + path2, n2);
	bch2_trans_node_add(trans, trans->paths + path1, n1);

	if (n3)
		six_unlock_intent(&n3->c.lock);
	if (n2)
		six_unlock_intent(&n2->c.lock);
	six_unlock_intent(&n1->c.lock);
out:
	if (path2) {
		__bch2_btree_path_unlock(trans, trans->paths + path2);
		bch2_path_put(trans, path2, true);
	}
	if (path1) {
		__bch2_btree_path_unlock(trans, trans->paths + path1);
		bch2_path_put(trans, path1, true);
	}

	bch2_trans_verify_locks(trans);

	bch2_time_stats_update(&c->times[n2
					 ? BCH_TIME_btree_node_split
					 : BCH_TIME_btree_node_compact],
			       start_time);
	return ret;
err:
	if (n3)
		bch2_btree_node_free_never_used(as, trans, n3);
	if (n2)
		bch2_btree_node_free_never_used(as, trans, n2);
	bch2_btree_node_free_never_used(as, trans, n1);
	goto out;
}

/**
 * bch2_btree_insert_node - insert bkeys into a given btree node
 *
 * @as:		btree_update object
 * @trans:	btree_trans object
 * @path_idx:	path that points to current node
 * @b:		node to insert keys into
 * @keys:	list of keys to insert
 *
 * Returns: 0 on success, typically transaction restart error on failure
 *
 * Inserts as many keys as it can into a given btree node, splitting it if full.
 * If a split occurred, this function will return early. This can only happen
 * for leaf nodes -- inserts into interior nodes have to be atomic.
 */
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
				  btree_path_idx_t path_idx, struct btree *b,
				  struct keylist *keys)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx, *linked;
	unsigned i;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;
	int ret;

	lockdep_assert_held(&c->gc_lock);
	BUG_ON(!b->c.level);
	BUG_ON(!as || as->b);
	bch2_verify_keylist_sorted(keys);

	if (!btree_node_intent_locked(path, b->c.level)) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);
		prt_printf(&buf, "%s(): node not locked at level %u\n",
			   __func__, b->c.level);
		bch2_btree_update_to_text(&buf, as);
		bch2_btree_path_to_text(&buf, trans, path_idx);

		bch2_print_string_as_lines(KERN_ERR, buf.buf);
		printbuf_exit(&buf);
		bch2_fs_emergency_read_only(c);
		return -EIO;
	}

	ret = bch2_btree_node_lock_write(trans, path, &b->c);
	if (ret)
		return ret;

	bch2_btree_node_prep_for_write(trans, path, b);

	if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
		bch2_btree_node_unlock_write(trans, path, b);
		goto split;
	}

	ret = bch2_btree_node_check_topology(trans, b) ?:
		bch2_btree_insert_keys_interior(as, trans, path, b,
					path->l[b->c.level].iter, keys);
	if (ret) {
		bch2_btree_node_unlock_write(trans, path, b);
		return ret;
	}

	trans_for_each_path_with_node(trans, b, linked, i)
		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);

	bch2_trans_verify_paths(trans);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	btree_update_updated_node(as, b);
	bch2_btree_node_unlock_write(trans, path, b);
	return 0;
split:
	/*
	 * We could attempt to avoid the transaction restart, by calling
	 * bch2_btree_path_upgrade() and allocating more nodes:
	 */
	if (b->c.level >= as->update_level_end) {
		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
	}

	return btree_split(as, trans, path_idx, b, keys);
}

int bch2_btree_split_leaf(struct btree_trans *trans,
			  btree_path_idx_t path,
			  unsigned flags)
{
	/* btree_split & merge may both cause paths array to be reallocated */
	struct btree *b = path_l(trans->paths + path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, trans->paths + path,
				     trans->paths[path].level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);
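	/*
	 * The split took intent locks on ancestor nodes; while we still hold
	 * them, check whether any of those nodes now want to be merged:
	 */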
		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
	}

	return btree_split(as, trans, path_idx, b, keys);
}

int bch2_btree_split_leaf(struct btree_trans *trans,
			  btree_path_idx_t path,
			  unsigned flags)
{
	/* btree_split & merge may both cause paths array to be reallocated */
	struct btree *b = path_l(trans->paths + path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, trans->paths + path,
				     trans->paths[path].level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);

	for (l = trans->paths[path].level + 1;
	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
	     l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}

static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
				   btree_path_idx_t path_idx)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx;
	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;

	BUG_ON(!btree_node_locked(path, b->c.level));

	n = __btree_root_alloc(as, trans, b->c.level + 1);

	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	path->locks_want++;
	BUG_ON(btree_node_locked(path, n->c.level));
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, path, n);

	n->sib_u64s[0] = U16_MAX;
	n->sib_u64s[1] = U16_MAX;

	bch2_keylist_add(&as->parent_keys, &b->key);
	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);

	int ret = bch2_btree_set_root(as, trans, path, n, true);
	BUG_ON(ret);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
	bch2_trans_node_add(trans, path, n);
	six_unlock_intent(&n->c.lock);

	mutex_lock(&c->btree_cache.lock);
	list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
	mutex_unlock(&c->btree_cache.lock);

	bch2_trans_verify_locks(trans);
}

int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;

	if (btree_node_fake(b))
		return bch2_btree_split_leaf(trans, path, flags);

	struct btree_update *as =
		bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	__btree_increase_depth(as, trans, path);
	bch2_btree_update_done(as, trans);
	return 0;
}

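/*
 * Try to merge @b with its sibling on the @sib side: estimate the size of the
 * two nodes repacked into a single node (applying hysteresis so nodes don't
 * oscillate between merging and splitting), and if that's under the merge
 * threshold, sort both into one new node and update the parent with a
 * whiteout for the previous node's key plus the new node's key:
 */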
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
				  btree_path_idx_t path,
				  unsigned level,
				  unsigned flags,
				  enum btree_node_sibling sib)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	struct bkey_format_state new_s;
	struct bkey_format new_f;
	struct bkey_i delete;
	struct btree *b, *m, *n, *prev, *next, *parent;
	struct bpos sib_pos;
	size_t sib_u64s;
	enum btree_id btree = trans->paths[path].btree_id;
	btree_path_idx_t sib_path = 0, new_path = 0;
	u64 start_time = local_clock();
	int ret = 0;

	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	BUG_ON(!trans->paths[path].should_be_locked);
	BUG_ON(!btree_node_locked(&trans->paths[path], level));

	/*
	 * Work around a deadlock caused by the btree write buffer not doing
	 * merges and leaving tons of merges for us to do - we really don't need
	 * to be doing merges at all from the interior update path, and if the
	 * interior update path is generating too many new interior updates we
	 * deadlock:
	 */
	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
		return 0;

	if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
		flags &= ~BCH_WATERMARK_MASK;
		flags |= BCH_WATERMARK_btree;
		flags |= BCH_TRANS_COMMIT_journal_reclaim;
	}

	b = trans->paths[path].l[level].b;

	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
		b->sib_u64s[sib] = U16_MAX;
		return 0;
	}

	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	sib_path = bch2_path_get(trans, btree, sib_pos,
				 U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
	ret = bch2_btree_path_traverse(trans, sib_path, false);
	if (ret)
		goto err;

	btree_path_set_should_be_locked(trans, trans->paths + sib_path);

	m = trans->paths[sib_path].l[level].b;

	if (btree_node_parent(trans->paths + path, b) !=
	    btree_node_parent(trans->paths + sib_path, m)) {
		b->sib_u64s[sib] = U16_MAX;
		goto out;
	}

	if (sib == btree_prev_sib) {
		prev = m;
		next = b;
	} else {
		prev = b;
		next = m;
	}

	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
		struct printbuf buf = PRINTBUF;

		printbuf_indent_add_nextline(&buf, 2);
		prt_printf(&buf, "%s(): ", __func__);
		ret = __bch2_topology_error(c, &buf);
		prt_newline(&buf);

		prt_printf(&buf, "prev ends at ");
		bch2_bpos_to_text(&buf, prev->data->max_key);
		prt_newline(&buf);

		prt_printf(&buf, "next starts at ");
		bch2_bpos_to_text(&buf, next->data->min_key);

		bch_err(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	bch2_bkey_format_init(&new_s);
	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
	__bch2_btree_calc_format(&new_s, prev);
	__bch2_btree_calc_format(&new_s, next);
	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
	new_f = bch2_bkey_format_done(&new_s);

	sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
		btree_node_u64s_with_format(m->nr, &m->format, &new_f);

	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
		sib_u64s /= 2;
		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
	}

	sib_u64s = min(sib_u64s, btree_max_u64s(c));
	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
	b->sib_u64s[sib] = sib_u64s;

	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
		goto out;

	parent = btree_node_parent(trans->paths + path, b);
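	/*
	 * The estimated merged size is under the threshold, so do the merge
	 * under a new btree_update:
	 */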
	as = bch2_btree_update_start(trans, trans->paths + path, level, false,
				     BCH_TRANS_COMMIT_no_enospc|flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto err;

	trace_and_count(c, btree_node_merge, trans, b);

	n = bch2_btree_node_alloc(as, trans, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
			   max(BTREE_NODE_SEQ(b->data),
			       BTREE_NODE_SEQ(m->data)) + 1);

	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);

	n->data->format = new_f;
	btree_node_set_format(n, new_f);

	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

	bch2_trans_verify_paths(trans);

	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
	if (ret)
		goto err_free_update;

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);

	bch2_trans_node_add(trans, trans->paths + path, n);

	bch2_trans_verify_paths(trans);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_path_put(trans, sib_path, true);
	bch2_trans_verify_locks(trans);
	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
		ret = 0;
	if (!ret)
		ret = bch2_trans_relock(trans);
	return ret;
err_free_update:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}

static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
			    struct btree *b)
{
	bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_intent);
	int ret = bch2_btree_iter_traverse(trans, iter);
	if (ret)
		goto err;

	/* has node been freed? */
	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
		/* node has been freed: */
		BUG_ON(!btree_node_dying(b));
		ret = -BCH_ERR_btree_node_dying;
		goto err;
	}

	BUG_ON(!btree_node_hashed(b));
	return 0;
err:
	bch2_trans_iter_exit(trans, iter);
	return ret;
}

int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct btree *b,
			    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *n, *parent;
	struct btree_update *as;
	btree_path_idx_t new_path = 0;
	int ret;

	flags |= BCH_TRANS_COMMIT_no_enospc;

	struct btree_path *path = btree_iter_path(trans, iter);
	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto out;

	n = bch2_btree_node_alloc_replacement(as, trans, b);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	trace_and_count(c, btree_node_rewrite, trans, b);

	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
	} else {
		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
	}

	if (ret)
		goto err;

	bch2_btree_interior_update_will_free_node(as, b);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);

	bch2_trans_node_add(trans, trans->paths + iter->path, n);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);
out:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_trans_downgrade(trans);
	return ret;
err:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}

static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
				       enum btree_id btree, unsigned level,
				       struct bkey_i *k, unsigned flags)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter,
				  btree, k->k.p,
				  BTREE_MAX_DEPTH, level, 0);
	struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto out;

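	/*
	 * The node this key referred to may have since been freed or
	 * rewritten - only rewrite it if the key we found still matches:
	 */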
	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
	ret = found
		? bch2_btree_node_rewrite(trans, &iter, b, flags)
		: -ENOENT;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
				enum btree_id btree, unsigned level,
				struct bpos pos, unsigned flags)
{
	BUG_ON(!level);

	/* Traverse one depth lower to get a pointer to the node itself: */
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
	struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto err;

	ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
					 struct btree *b, unsigned flags)
{
	struct btree_iter iter;
	int ret = get_iter_to_node(trans, &iter, b);
	if (ret)
		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;

	ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

struct async_btree_rewrite {
	struct bch_fs *c;
	struct work_struct work;
	struct list_head list;
	enum btree_id btree_id;
	unsigned level;
	struct bkey_buf key;
};

static void async_btree_node_rewrite_work(struct work_struct *work)
{
	struct async_btree_rewrite *a =
		container_of(work, struct async_btree_rewrite, work);
	struct bch_fs *c = a->c;

	int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
					a->btree_id, a->level, a->key.k, 0));
	if (ret != -ENOENT &&
	    !bch2_err_matches(ret, EROFS) &&
	    ret != -BCH_ERR_journal_shutdown)
		bch_err_fn_ratelimited(c, ret);

	spin_lock(&c->btree_node_rewrites_lock);
	list_del(&a->list);
	spin_unlock(&c->btree_node_rewrites_lock);

	closure_wake_up(&c->btree_node_rewrites_wait);

	bch2_bkey_buf_exit(&a->key, c);
	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
	kfree(a);
}

void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a)
		return;

	a->c = c;
	a->btree_id = b->c.btree_id;
	a->level = b->c.level;
	INIT_WORK(&a->work, async_btree_node_rewrite_work);

	bch2_bkey_buf_init(&a->key);
	bch2_bkey_buf_copy(&a->key, c, &b->key);

	bool now = false, pending = false;

	spin_lock(&c->btree_node_rewrites_lock);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		list_add(&a->list, &c->btree_node_rewrites);
		now = true;
	} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
		list_add(&a->list, &c->btree_node_rewrites_pending);
		pending = true;
	}
	spin_unlock(&c->btree_node_rewrites_lock);

	if (now) {
		queue_work(c->btree_node_rewrite_worker, &a->work);
	} else if (pending) {
		/* bch2_do_pending_node_rewrites will execute */
	} else {
		bch2_bkey_buf_exit(&a->key, c);
		kfree(a);
	}
}

void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
{
	closure_wait_event(&c->btree_node_rewrites_wait,
			   list_empty(&c->btree_node_rewrites));
}

void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
	while (1) {
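		/*
		 * Move pending rewrites to the active list one at a time,
		 * taking a write ref for each before queueing its work item:
		 */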
		spin_lock(&c->btree_node_rewrites_lock);
		struct async_btree_rewrite *a =
			list_pop_entry(&c->btree_node_rewrites_pending,
				       struct async_btree_rewrite, list);
		if (a)
			list_add(&a->list, &c->btree_node_rewrites);
		spin_unlock(&c->btree_node_rewrites_lock);

		if (!a)
			break;

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
		queue_work(c->btree_node_rewrite_worker, &a->work);
	}
}

void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
	while (1) {
		spin_lock(&c->btree_node_rewrites_lock);
		struct async_btree_rewrite *a =
			list_pop_entry(&c->btree_node_rewrites_pending,
				       struct async_btree_rewrite, list);
		spin_unlock(&c->btree_node_rewrites_lock);

		if (!a)
			break;

		bch2_bkey_buf_exit(&a->key, c);
		kfree(a);
	}
}

static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					unsigned commit_flags,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = {};
	struct btree *parent;
	int ret;

	if (!skip_triggers) {
		ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
					   bkey_i_to_s_c(&b->key),
					   BTREE_TRIGGER_transactional) ?:
			bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
					     bkey_i_to_s(new_key),
					     BTREE_TRIGGER_transactional);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
				new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

	parent = btree_node_parent(btree_iter_path(trans, iter), b);
	if (parent) {
		bch2_trans_copy_iter(trans, &iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
				iter2.flags & BTREE_ITER_intent,
				_THIS_IP_);

		struct btree_path *path2 = btree_iter_path(trans, &iter2);
		BUG_ON(path2->level != b->c.level);
		BUG_ON(!bpos_eq(path2->pos, new_key->k.p));

		btree_path_set_level_up(trans, path2);

		trans->paths_sorted = false;

		ret = bch2_btree_iter_traverse(trans, &iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
				jset_u64s(new_key->k.u64s));
		ret = PTR_ERR_OR_ZERO(e);
		if (ret)
			return ret;

		journal_entry_set(e,
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  new_key, new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
	if (ret)
		goto err;

	bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}

int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       unsigned commit_flags, bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct btree_path *path = btree_iter_path(trans, iter);
	struct closure cl;
	int ret = 0;

	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
	if (ret)
		return ret;

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		if (ret) {
			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
			if (ret)
				return ret;
		}

		new_hash = bch2_btree_node_mem_alloc(trans, false);
		ret = PTR_ERR_OR_ZERO(new_hash);
		if (ret)
			goto err;
	}

	path->intent_ref++;
	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
					   commit_flags, skip_triggers);
	--path->intent_ref;

	if (new_hash)
		bch2_btree_node_to_freelist(c, new_hash);
err:
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}

int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					unsigned commit_flags, bool skip_triggers)
{
	struct btree_iter iter;
	int ret = get_iter_to_node(trans, &iter, b);
	if (ret)
		return ret == -BCH_ERR_btree_node_dying ? 0 : ret;

	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
					 commit_flags, skip_triggers);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Init code: */

/*
 * Only for filesystem bringup, when first reading the btree roots or allocating
 * btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}

int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, false);
	bch2_btree_cache_cannibalize_unlock(trans);

	ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level = level;
	b->c.btree_id = id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	b->data->flags = 0;
	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
	return 0;
}

void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
	bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
}

static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
	prt_printf(out, "%ps: ", (void *) as->ip_started);
	bch2_trans_commit_flags_to_text(out, as->flags);

	prt_str(out, " ");
	bch2_btree_id_to_text(out, as->btree_id);
	prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
		   as->update_level_start,
		   as->update_level_end,
		   bch2_btree_update_modes[as->mode],
		   as->nodes_written,
		   closure_nr_remaining(&as->cl),
		   as->journal.seq);
}

void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		bch2_btree_update_to_text(out, as);
	mutex_unlock(&c->btree_interior_update_lock);
}

static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->btree_interior_update_lock);
	ret = !list_empty(&c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}

bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
	bool ret = bch2_btree_interior_updates_pending(c);

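	/*
	 * If any interior updates were in flight, wait for them all to
	 * complete before returning:
	 */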
	if (ret)
		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_pending(c));
	return ret;
}

void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
{
	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);

	mutex_lock(&c->btree_root_lock);

	r->level = entry->level;
	r->alive = true;
	bkey_copy(&r->key, (struct bkey_i *) entry->start);

	mutex_unlock(&c->btree_root_lock);
}

struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *end,
				    unsigned long skip)
{
	unsigned i;

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->alive && !test_bit(i, &skip)) {
			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
					  i, r->level, &r->key, r->key.k.u64s);
			end = vstruct_next(end);
		}
	}

	mutex_unlock(&c->btree_root_lock);

	return end;
}

static void bch2_btree_alloc_to_text(struct printbuf *out,
				     struct bch_fs *c,
				     struct btree_alloc *a)
{
	printbuf_indent_add(out, 2);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
	prt_newline(out);

	struct open_bucket *ob;
	unsigned i;
	open_bucket_for_each(c, &a->ob, ob, i)
		bch2_open_bucket_to_text(out, c, ob);

	printbuf_indent_sub(out, 2);
}

void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
		bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
}

void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	WARN_ON(!list_empty(&c->btree_node_rewrites));
	WARN_ON(!list_empty(&c->btree_node_rewrites_pending));

	if (c->btree_node_rewrite_worker)
		destroy_workqueue(c->btree_node_rewrite_worker);
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}

void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	INIT_LIST_HEAD(&c->btree_node_rewrites);
	INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
	spin_lock_init(&c->btree_node_rewrites_lock);
}

int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
	if (!c->btree_interior_update_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	c->btree_node_rewrite_worker =
		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
	if (!c->btree_node_rewrite_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
				      sizeof(struct btree_update)))
		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;

	return 0;
}