1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "btree_cache.h" 5 #include "btree_iter.h" 6 #include "btree_key_cache.h" 7 #include "btree_locking.h" 8 #include "btree_update.h" 9 #include "errcode.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_reclaim.h" 13 #include "trace.h" 14 15 #include <linux/sched/mm.h> 16 17 static inline bool btree_uses_pcpu_readers(enum btree_id id) 18 { 19 return id == BTREE_ID_subvolumes; 20 } 21 22 static struct kmem_cache *bch2_key_cache; 23 24 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, 25 const void *obj) 26 { 27 const struct bkey_cached *ck = obj; 28 const struct bkey_cached_key *key = arg->key; 29 30 return ck->key.btree_id != key->btree_id || 31 !bpos_eq(ck->key.pos, key->pos); 32 } 33 34 static const struct rhashtable_params bch2_btree_key_cache_params = { 35 .head_offset = offsetof(struct bkey_cached, hash), 36 .key_offset = offsetof(struct bkey_cached, key), 37 .key_len = sizeof(struct bkey_cached_key), 38 .obj_cmpfn = bch2_btree_key_cache_cmp_fn, 39 .automatic_shrinking = true, 40 }; 41 42 static inline void btree_path_cached_set(struct btree_trans *trans, struct btree_path *path, 43 struct bkey_cached *ck, 44 enum btree_node_locked_type lock_held) 45 { 46 path->l[0].lock_seq = six_lock_seq(&ck->c.lock); 47 path->l[0].b = (void *) ck; 48 mark_btree_node_locked(trans, path, 0, lock_held); 49 } 50 51 __flatten 52 inline struct bkey_cached * 53 bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) 54 { 55 struct bkey_cached_key key = { 56 .btree_id = btree_id, 57 .pos = pos, 58 }; 59 60 return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, 61 bch2_btree_key_cache_params); 62 } 63 64 static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) 65 { 66 if (!six_trylock_intent(&ck->c.lock)) 67 return false; 68 69 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 70 six_unlock_intent(&ck->c.lock); 71 return false; 72 } 73 74 if (!six_trylock_write(&ck->c.lock)) { 75 six_unlock_intent(&ck->c.lock); 76 return false; 77 } 78 79 return true; 80 } 81 82 static bool bkey_cached_evict(struct btree_key_cache *c, 83 struct bkey_cached *ck) 84 { 85 bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, 86 bch2_btree_key_cache_params); 87 if (ret) { 88 memset(&ck->key, ~0, sizeof(ck->key)); 89 atomic_long_dec(&c->nr_keys); 90 } 91 92 return ret; 93 } 94 95 static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) 96 { 97 struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); 98 struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); 99 100 this_cpu_dec(*c->btree_key_cache.nr_pending); 101 kmem_cache_free(bch2_key_cache, ck); 102 } 103 104 static void bkey_cached_free(struct btree_key_cache *bc, 105 struct bkey_cached *ck) 106 { 107 kfree(ck->k); 108 ck->k = NULL; 109 ck->u64s = 0; 110 111 six_unlock_write(&ck->c.lock); 112 six_unlock_intent(&ck->c.lock); 113 114 bool pcpu_readers = ck->c.lock.readers != NULL; 115 rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); 116 this_cpu_inc(*bc->nr_pending); 117 } 118 119 static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) 120 { 121 gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; 122 123 struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); 124 if (unlikely(!ck)) 125 return NULL; 126 ck->k = kmalloc(key_u64s * sizeof(u64), gfp); 127 if (unlikely(!ck->k)) { 128 kmem_cache_free(bch2_key_cache, ck); 129 return NULL; 130 } 131 ck->u64s = key_u64s; 132 return ck; 133 } 134 135 static struct bkey_cached * 136 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) 137 { 138 struct bch_fs *c = trans->c; 139 struct btree_key_cache *bc = &c->btree_key_cache; 140 bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); 141 int ret; 142 143 struct bkey_cached *ck = container_of_or_null( 144 rcu_pending_dequeue(&bc->pending[pcpu_readers]), 145 struct bkey_cached, rcu); 146 if (ck) 147 goto lock; 148 149 ck = allocate_dropping_locks(trans, ret, 150 __bkey_cached_alloc(key_u64s, _gfp)); 151 if (ret) { 152 if (ck) 153 kfree(ck->k); 154 kmem_cache_free(bch2_key_cache, ck); 155 return ERR_PTR(ret); 156 } 157 158 if (ck) { 159 bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); 160 ck->c.cached = true; 161 goto lock; 162 } 163 164 ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), 165 struct bkey_cached, rcu); 166 if (ck) 167 goto lock; 168 lock: 169 six_lock_intent(&ck->c.lock, NULL, NULL); 170 six_lock_write(&ck->c.lock, NULL, NULL); 171 return ck; 172 } 173 174 static struct bkey_cached * 175 bkey_cached_reuse(struct btree_key_cache *c) 176 { 177 struct bucket_table *tbl; 178 struct rhash_head *pos; 179 struct bkey_cached *ck; 180 unsigned i; 181 182 rcu_read_lock(); 183 tbl = rht_dereference_rcu(c->table.tbl, &c->table); 184 for (i = 0; i < tbl->size; i++) 185 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { 186 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && 187 bkey_cached_lock_for_evict(ck)) { 188 if (bkey_cached_evict(c, ck)) 189 goto out; 190 six_unlock_write(&ck->c.lock); 191 six_unlock_intent(&ck->c.lock); 192 } 193 } 194 ck = NULL; 195 out: 196 rcu_read_unlock(); 197 return ck; 198 } 199 200 static int btree_key_cache_create(struct btree_trans *trans, 201 struct btree_path *path, 202 struct btree_path *ck_path, 203 struct bkey_s_c k) 204 { 205 struct bch_fs *c = trans->c; 206 struct btree_key_cache *bc = &c->btree_key_cache; 207 208 /* 209 * bch2_varint_decode can read past the end of the buffer by at 210 * most 7 bytes (it won't be used): 211 */ 212 unsigned key_u64s = k.k->u64s + 1; 213 214 /* 215 * Allocate some extra space so that the transaction commit path is less 216 * likely to have to reallocate, since that requires a transaction 217 * restart: 218 */ 219 key_u64s = min(256U, (key_u64s * 3) / 2); 220 key_u64s = roundup_pow_of_two(key_u64s); 221 222 struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); 223 int ret = PTR_ERR_OR_ZERO(ck); 224 if (ret) 225 return ret; 226 227 if (unlikely(!ck)) { 228 ck = bkey_cached_reuse(bc); 229 if (unlikely(!ck)) { 230 bch_err(c, "error allocating memory for key cache item, btree %s", 231 bch2_btree_id_str(ck_path->btree_id)); 232 return -BCH_ERR_ENOMEM_btree_key_cache_create; 233 } 234 } 235 236 ck->c.level = 0; 237 ck->c.btree_id = ck_path->btree_id; 238 ck->key.btree_id = ck_path->btree_id; 239 ck->key.pos = ck_path->pos; 240 ck->flags = 1U << BKEY_CACHED_ACCESSED; 241 242 if (unlikely(key_u64s > ck->u64s)) { 243 mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); 244 245 struct bkey_i *new_k = allocate_dropping_locks(trans, ret, 246 kmalloc(key_u64s * sizeof(u64), _gfp)); 247 if (unlikely(!new_k)) { 248 bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", 249 bch2_btree_id_str(ck->key.btree_id), key_u64s); 250 ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; 251 } else if (ret) { 252 kfree(new_k); 253 goto err; 254 } 255 256 kfree(ck->k); 257 ck->k = new_k; 258 ck->u64s = key_u64s; 259 } 260 261 bkey_reassemble(ck->k, k); 262 263 ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); 264 if (unlikely(ret)) 265 goto err; 266 267 ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); 268 269 bch2_btree_node_unlock_write(trans, path, path_l(path)->b); 270 271 if (unlikely(ret)) /* raced with another fill? */ 272 goto err; 273 274 atomic_long_inc(&bc->nr_keys); 275 six_unlock_write(&ck->c.lock); 276 277 enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); 278 if (lock_want == SIX_LOCK_read) 279 six_lock_downgrade(&ck->c.lock); 280 btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); 281 ck_path->uptodate = BTREE_ITER_UPTODATE; 282 return 0; 283 err: 284 bkey_cached_free(bc, ck); 285 mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); 286 287 return ret; 288 } 289 290 static noinline int btree_key_cache_fill(struct btree_trans *trans, 291 struct btree_path *ck_path, 292 unsigned flags) 293 { 294 if (flags & BTREE_ITER_cached_nofill) { 295 ck_path->l[0].b = NULL; 296 return 0; 297 } 298 299 struct bch_fs *c = trans->c; 300 struct btree_iter iter; 301 struct bkey_s_c k; 302 int ret; 303 304 bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, 305 BTREE_ITER_intent| 306 BTREE_ITER_key_cache_fill| 307 BTREE_ITER_cached_nofill); 308 iter.flags &= ~BTREE_ITER_with_journal; 309 k = bch2_btree_iter_peek_slot(&iter); 310 ret = bkey_err(k); 311 if (ret) 312 goto err; 313 314 /* Recheck after btree lookup, before allocating: */ 315 ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; 316 if (unlikely(ret)) 317 goto out; 318 319 ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); 320 if (ret) 321 goto err; 322 323 if (trace_key_cache_fill_enabled()) { 324 struct printbuf buf = PRINTBUF; 325 326 bch2_bpos_to_text(&buf, ck_path->pos); 327 prt_char(&buf, ' '); 328 bch2_bkey_val_to_text(&buf, trans->c, k); 329 trace_key_cache_fill(trans, buf.buf); 330 printbuf_exit(&buf); 331 } 332 out: 333 /* We're not likely to need this iterator again: */ 334 bch2_set_btree_iter_dontneed(&iter); 335 err: 336 bch2_trans_iter_exit(trans, &iter); 337 return ret; 338 } 339 340 static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, 341 struct btree_path *path) 342 { 343 struct bch_fs *c = trans->c; 344 struct bkey_cached *ck; 345 retry: 346 ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); 347 if (!ck) 348 return -ENOENT; 349 350 enum six_lock_type lock_want = __btree_lock_want(path, 0); 351 352 int ret = btree_node_lock(trans, path, (void *) ck, 0, lock_want, _THIS_IP_); 353 if (ret) 354 return ret; 355 356 if (ck->key.btree_id != path->btree_id || 357 !bpos_eq(ck->key.pos, path->pos)) { 358 six_unlock_type(&ck->c.lock, lock_want); 359 goto retry; 360 } 361 362 if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) 363 set_bit(BKEY_CACHED_ACCESSED, &ck->flags); 364 365 btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); 366 path->uptodate = BTREE_ITER_UPTODATE; 367 return 0; 368 } 369 370 int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, 371 unsigned flags) 372 { 373 EBUG_ON(path->level); 374 375 path->l[1].b = NULL; 376 377 int ret; 378 do { 379 ret = btree_path_traverse_cached_fast(trans, path); 380 if (unlikely(ret == -ENOENT)) 381 ret = btree_key_cache_fill(trans, path, flags); 382 } while (ret == -EEXIST); 383 384 if (unlikely(ret)) { 385 path->uptodate = BTREE_ITER_NEED_TRAVERSE; 386 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 387 btree_node_unlock(trans, path, 0); 388 path->l[0].b = ERR_PTR(ret); 389 } 390 } 391 return ret; 392 } 393 394 static int btree_key_cache_flush_pos(struct btree_trans *trans, 395 struct bkey_cached_key key, 396 u64 journal_seq, 397 unsigned commit_flags, 398 bool evict) 399 { 400 struct bch_fs *c = trans->c; 401 struct journal *j = &c->journal; 402 struct btree_iter c_iter, b_iter; 403 struct bkey_cached *ck = NULL; 404 int ret; 405 406 bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, 407 BTREE_ITER_slots| 408 BTREE_ITER_intent| 409 BTREE_ITER_all_snapshots); 410 bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, 411 BTREE_ITER_cached| 412 BTREE_ITER_intent); 413 b_iter.flags &= ~BTREE_ITER_with_key_cache; 414 415 ret = bch2_btree_iter_traverse(&c_iter); 416 if (ret) 417 goto out; 418 419 ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; 420 if (!ck) 421 goto out; 422 423 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 424 if (evict) 425 goto evict; 426 goto out; 427 } 428 429 if (journal_seq && ck->journal.seq != journal_seq) 430 goto out; 431 432 trans->journal_res.seq = ck->journal.seq; 433 434 /* 435 * If we're at the end of the journal, we really want to free up space 436 * in the journal right away - we don't want to pin that old journal 437 * sequence number with a new btree node write, we want to re-journal 438 * the update 439 */ 440 if (ck->journal.seq == journal_last_seq(j)) 441 commit_flags |= BCH_WATERMARK_reclaim; 442 443 if (ck->journal.seq != journal_last_seq(j) || 444 !test_bit(JOURNAL_space_low, &c->journal.flags)) 445 commit_flags |= BCH_TRANS_COMMIT_no_journal_res; 446 447 struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); 448 ret = bkey_err(btree_k); 449 if (ret) 450 goto err; 451 452 /* * Check that we're not violating cache coherency rules: */ 453 BUG_ON(bkey_deleted(btree_k.k)); 454 455 ret = bch2_trans_update(trans, &b_iter, ck->k, 456 BTREE_UPDATE_key_cache_reclaim| 457 BTREE_UPDATE_internal_snapshot_node| 458 BTREE_TRIGGER_norun) ?: 459 bch2_trans_commit(trans, NULL, NULL, 460 BCH_TRANS_COMMIT_no_check_rw| 461 BCH_TRANS_COMMIT_no_enospc| 462 commit_flags); 463 err: 464 bch2_fs_fatal_err_on(ret && 465 !bch2_err_matches(ret, BCH_ERR_transaction_restart) && 466 !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && 467 !bch2_journal_error(j), c, 468 "flushing key cache: %s", bch2_err_str(ret)); 469 if (ret) 470 goto out; 471 472 bch2_journal_pin_drop(j, &ck->journal); 473 474 struct btree_path *path = btree_iter_path(trans, &c_iter); 475 BUG_ON(!btree_node_locked(path, 0)); 476 477 if (!evict) { 478 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 479 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 480 atomic_long_dec(&c->btree_key_cache.nr_dirty); 481 } 482 } else { 483 struct btree_path *path2; 484 unsigned i; 485 evict: 486 trans_for_each_path(trans, path2, i) 487 if (path2 != path) 488 __bch2_btree_path_unlock(trans, path2); 489 490 bch2_btree_node_lock_write_nofail(trans, path, &ck->c); 491 492 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 493 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 494 atomic_long_dec(&c->btree_key_cache.nr_dirty); 495 } 496 497 mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); 498 if (bkey_cached_evict(&c->btree_key_cache, ck)) { 499 bkey_cached_free(&c->btree_key_cache, ck); 500 } else { 501 six_unlock_write(&ck->c.lock); 502 six_unlock_intent(&ck->c.lock); 503 } 504 } 505 out: 506 bch2_trans_iter_exit(trans, &b_iter); 507 bch2_trans_iter_exit(trans, &c_iter); 508 return ret; 509 } 510 511 int bch2_btree_key_cache_journal_flush(struct journal *j, 512 struct journal_entry_pin *pin, u64 seq) 513 { 514 struct bch_fs *c = container_of(j, struct bch_fs, journal); 515 struct bkey_cached *ck = 516 container_of(pin, struct bkey_cached, journal); 517 struct bkey_cached_key key; 518 struct btree_trans *trans = bch2_trans_get(c); 519 int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 520 int ret = 0; 521 522 btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); 523 key = ck->key; 524 525 if (ck->journal.seq != seq || 526 !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 527 six_unlock_read(&ck->c.lock); 528 goto unlock; 529 } 530 531 if (ck->seq != seq) { 532 bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, 533 bch2_btree_key_cache_journal_flush); 534 six_unlock_read(&ck->c.lock); 535 goto unlock; 536 } 537 six_unlock_read(&ck->c.lock); 538 539 ret = lockrestart_do(trans, 540 btree_key_cache_flush_pos(trans, key, seq, 541 BCH_TRANS_COMMIT_journal_reclaim, false)); 542 unlock: 543 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 544 545 bch2_trans_put(trans); 546 return ret; 547 } 548 549 bool bch2_btree_insert_key_cached(struct btree_trans *trans, 550 unsigned flags, 551 struct btree_insert_entry *insert_entry) 552 { 553 struct bch_fs *c = trans->c; 554 struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; 555 struct bkey_i *insert = insert_entry->k; 556 bool kick_reclaim = false; 557 558 BUG_ON(insert->k.u64s > ck->u64s); 559 560 bkey_copy(ck->k, insert); 561 562 if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 563 EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); 564 set_bit(BKEY_CACHED_DIRTY, &ck->flags); 565 atomic_long_inc(&c->btree_key_cache.nr_dirty); 566 567 if (bch2_nr_btree_keys_need_flush(c)) 568 kick_reclaim = true; 569 } 570 571 /* 572 * To minimize lock contention, we only add the journal pin here and 573 * defer pin updates to the flush callback via ->seq. Be careful not to 574 * update ->seq on nojournal commits because we don't want to update the 575 * pin to a seq that doesn't include journal updates on disk. Otherwise 576 * we risk losing the update after a crash. 577 * 578 * The only exception is if the pin is not active in the first place. We 579 * have to add the pin because journal reclaim drives key cache 580 * flushing. The flush callback will not proceed unless ->seq matches 581 * the latest pin, so make sure it starts with a consistent value. 582 */ 583 if (!(insert_entry->flags & BTREE_UPDATE_nojournal) || 584 !journal_pin_active(&ck->journal)) { 585 ck->seq = trans->journal_res.seq; 586 } 587 bch2_journal_pin_add(&c->journal, trans->journal_res.seq, 588 &ck->journal, bch2_btree_key_cache_journal_flush); 589 590 if (kick_reclaim) 591 journal_reclaim_kick(&c->journal); 592 return true; 593 } 594 595 void bch2_btree_key_cache_drop(struct btree_trans *trans, 596 struct btree_path *path) 597 { 598 struct bch_fs *c = trans->c; 599 struct btree_key_cache *bc = &c->btree_key_cache; 600 struct bkey_cached *ck = (void *) path->l[0].b; 601 602 /* 603 * We just did an update to the btree, bypassing the key cache: the key 604 * cache key is now stale and must be dropped, even if dirty: 605 */ 606 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 607 clear_bit(BKEY_CACHED_DIRTY, &ck->flags); 608 atomic_long_dec(&c->btree_key_cache.nr_dirty); 609 bch2_journal_pin_drop(&c->journal, &ck->journal); 610 } 611 612 bkey_cached_evict(bc, ck); 613 bkey_cached_free(bc, ck); 614 615 mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); 616 617 struct btree_path *path2; 618 unsigned i; 619 trans_for_each_path(trans, path2, i) 620 if (path2->l[0].b == (void *) ck) { 621 __bch2_btree_path_unlock(trans, path2); 622 path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); 623 path2->should_be_locked = false; 624 btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); 625 } 626 627 bch2_trans_verify_locks(trans); 628 } 629 630 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, 631 struct shrink_control *sc) 632 { 633 struct bch_fs *c = shrink->private_data; 634 struct btree_key_cache *bc = &c->btree_key_cache; 635 struct bucket_table *tbl; 636 struct bkey_cached *ck; 637 size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; 638 unsigned iter, start; 639 int srcu_idx; 640 641 srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 642 rcu_read_lock(); 643 644 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 645 646 /* 647 * Scanning is expensive while a rehash is in progress - most elements 648 * will be on the new hashtable, if it's in progress 649 * 650 * A rehash could still start while we're scanning - that's ok, we'll 651 * still see most elements. 652 */ 653 if (unlikely(tbl->nest)) { 654 rcu_read_unlock(); 655 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 656 return SHRINK_STOP; 657 } 658 659 iter = bc->shrink_iter; 660 if (iter >= tbl->size) 661 iter = 0; 662 start = iter; 663 664 do { 665 struct rhash_head *pos, *next; 666 667 pos = rht_ptr_rcu(&tbl->buckets[iter]); 668 669 while (!rht_is_a_nulls(pos)) { 670 next = rht_dereference_bucket_rcu(pos->next, tbl, iter); 671 ck = container_of(pos, struct bkey_cached, hash); 672 673 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { 674 bc->skipped_dirty++; 675 } else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) { 676 clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); 677 bc->skipped_accessed++; 678 } else if (!bkey_cached_lock_for_evict(ck)) { 679 bc->skipped_lock_fail++; 680 } else if (bkey_cached_evict(bc, ck)) { 681 bkey_cached_free(bc, ck); 682 bc->freed++; 683 freed++; 684 } else { 685 six_unlock_write(&ck->c.lock); 686 six_unlock_intent(&ck->c.lock); 687 } 688 689 scanned++; 690 if (scanned >= nr) 691 goto out; 692 693 pos = next; 694 } 695 696 iter++; 697 if (iter >= tbl->size) 698 iter = 0; 699 } while (scanned < nr && iter != start); 700 out: 701 bc->shrink_iter = iter; 702 703 rcu_read_unlock(); 704 srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 705 706 return freed; 707 } 708 709 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, 710 struct shrink_control *sc) 711 { 712 struct bch_fs *c = shrink->private_data; 713 struct btree_key_cache *bc = &c->btree_key_cache; 714 long nr = atomic_long_read(&bc->nr_keys) - 715 atomic_long_read(&bc->nr_dirty); 716 717 /* 718 * Avoid hammering our shrinker too much if it's nearly empty - the 719 * shrinker code doesn't take into account how big our cache is, if it's 720 * mostly empty but the system is under memory pressure it causes nasty 721 * lock contention: 722 */ 723 nr -= 128; 724 725 return max(0L, nr); 726 } 727 728 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 729 { 730 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 731 struct bucket_table *tbl; 732 struct bkey_cached *ck; 733 struct rhash_head *pos; 734 LIST_HEAD(items); 735 unsigned i; 736 737 shrinker_free(bc->shrink); 738 739 /* 740 * The loop is needed to guard against racing with rehash: 741 */ 742 while (atomic_long_read(&bc->nr_keys)) { 743 rcu_read_lock(); 744 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); 745 if (tbl) { 746 if (tbl->nest) { 747 /* wait for in progress rehash */ 748 rcu_read_unlock(); 749 mutex_lock(&bc->table.mutex); 750 mutex_unlock(&bc->table.mutex); 751 rcu_read_lock(); 752 continue; 753 } 754 for (i = 0; i < tbl->size; i++) 755 while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { 756 ck = container_of(pos, struct bkey_cached, hash); 757 BUG_ON(!bkey_cached_evict(bc, ck)); 758 kfree(ck->k); 759 kmem_cache_free(bch2_key_cache, ck); 760 } 761 } 762 rcu_read_unlock(); 763 } 764 765 if (atomic_long_read(&bc->nr_dirty) && 766 !bch2_journal_error(&c->journal) && 767 test_bit(BCH_FS_was_rw, &c->flags)) 768 panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", 769 atomic_long_read(&bc->nr_dirty)); 770 771 if (atomic_long_read(&bc->nr_keys)) 772 panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", 773 atomic_long_read(&bc->nr_keys)); 774 775 if (bc->table_init_done) 776 rhashtable_destroy(&bc->table); 777 778 rcu_pending_exit(&bc->pending[0]); 779 rcu_pending_exit(&bc->pending[1]); 780 781 free_percpu(bc->nr_pending); 782 } 783 784 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) 785 { 786 } 787 788 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) 789 { 790 struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); 791 struct shrinker *shrink; 792 793 bc->nr_pending = alloc_percpu(size_t); 794 if (!bc->nr_pending) 795 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 796 797 if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || 798 rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) 799 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 800 801 if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) 802 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 803 804 bc->table_init_done = true; 805 806 shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); 807 if (!shrink) 808 return -BCH_ERR_ENOMEM_fs_btree_cache_init; 809 bc->shrink = shrink; 810 shrink->count_objects = bch2_btree_key_cache_count; 811 shrink->scan_objects = bch2_btree_key_cache_scan; 812 shrink->batch = 1 << 14; 813 shrink->seeks = 0; 814 shrink->private_data = c; 815 shrinker_register(shrink); 816 return 0; 817 } 818 819 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) 820 { 821 printbuf_tabstop_push(out, 24); 822 printbuf_tabstop_push(out, 12); 823 824 prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); 825 prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); 826 prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); 827 prt_newline(out); 828 prt_printf(out, "shrinker:\n"); 829 prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); 830 prt_printf(out, "freed:\t%lu\r\n", bc->freed); 831 prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); 832 prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); 833 prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); 834 prt_newline(out); 835 prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); 836 } 837 838 void bch2_btree_key_cache_exit(void) 839 { 840 kmem_cache_destroy(bch2_key_cache); 841 } 842 843 int __init bch2_btree_key_cache_init(void) 844 { 845 bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); 846 if (!bch2_key_cache) 847 return -ENOMEM; 848 849 return 0; 850 } 851