// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/ioprio.h>

static const char * const bch2_data_update_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_UPDATE_TYPES()
#undef x
	NULL
};

static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr)
		bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
}

static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
			bkey_for_each_ptr(ptrs, ptr2) {
				if (ptr2 == ptr)
					break;
				bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
			}
			return false;
		}
	}
	return true;
}

static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
	}
}

static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		if (ctxt) {
			bool locked;

			move_ctxt_wait_event(ctxt,
				(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
				list_empty(&ctxt->ios));

			if (!locked)
				bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
		} else {
			if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
				bkey_for_each_ptr(ptrs, ptr2) {
					if (ptr2 == ptr)
						break;

					ca = bch2_dev_have_ref(c, ptr2->dev);
					bucket = PTR_BUCKET_POS(ca, ptr2);
					bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
				}
				return false;
			}
		}
	}
	return true;
}

static noinline void trace_io_move_finish2(struct data_update *u,
					   struct bkey_i *new,
					   struct bkey_i *insert)
{
	struct bch_fs *c = u->op.c;
	struct printbuf buf = PRINTBUF;

	prt_newline(&buf);

	bch2_data_update_to_text(&buf, u);
	prt_newline(&buf);

	prt_str_indented(&buf, "new replicas:\t");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
	prt_newline(&buf);

	prt_str_indented(&buf, "insert:\t");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	prt_newline(&buf);

	trace_io_move_finish(c, buf.buf);
	printbuf_exit(&buf);
}

static void trace_io_move_fail2(struct data_update *m,
				struct bkey_s_c new,
				struct bkey_s_c wrote,
				struct bkey_i *insert,
				const char *msg)
{
	struct bch_fs *c = m->op.c;
	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
	struct printbuf buf = PRINTBUF;
	unsigned rewrites_found = 0;

	if (!trace_io_move_fail_enabled())
		return;

	prt_str(&buf, msg);

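	/*
	 * For the trace message, recount which of the pointers we wanted to
	 * rewrite still have a live (non-cached) match in @insert:
	 */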
	if (insert) {
		const union bch_extent_entry *entry;
		struct bch_extent_ptr *ptr;
		struct extent_ptr_decoded p;

		unsigned ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached)
				rewrites_found |= ptr_bit;
			ptr_bit <<= 1;
		}
	}

	prt_str(&buf, "rewrites found:\t");
	bch2_prt_u64_base2(&buf, rewrites_found);
	prt_newline(&buf);

	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);

	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, new);

	prt_str(&buf, "\nwrote: ");
	bch2_bkey_val_to_text(&buf, c, wrote);

	if (insert) {
		prt_str(&buf, "\ninsert: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	}

	trace_io_move_fail(c, buf.buf);
	printbuf_exit(&buf);
}

static int __bch2_data_update_index_update(struct btree_trans *trans,
					   struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_iter iter;
	struct data_update *m =
		container_of(op, struct data_update, op);
	struct keylist *keys = &op->insert_keys;
	struct bkey_buf _new, _insert;
	struct printbuf journal_msg = PRINTBUF;
	int ret = 0;

	bch2_bkey_buf_init(&_new);
	bch2_bkey_buf_init(&_insert);
	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);

	bch2_trans_iter_init(trans, &iter, m->btree_id,
			     bkey_start_pos(&bch2_keylist_front(keys)->k),
			     BTREE_ITER_slots|BTREE_ITER_intent);

	while (1) {
		struct bkey_s_c k;
		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
		struct bkey_i *insert = NULL;
		struct bkey_i_extent *new;
		const union bch_extent_entry *entry_c;
		union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		struct bch_extent_ptr *ptr;
		const struct bch_extent_ptr *ptr_c;
		struct bpos next_pos;
		bool should_check_enospc;
		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
		unsigned rewrites_found = 0, durability, ptr_bit;

		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		new = bkey_i_to_extent(bch2_keylist_front(keys));

		if (!bch2_extents_match(k, old)) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
					    NULL, "no match:");
			goto nowork;
		}

		bkey_reassemble(_insert.k, k);
		insert = _insert.k;

		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
		new = bkey_i_to_extent(_new.k);
		bch2_cut_front(iter.pos, &new->k_i);

		bch2_cut_front(iter.pos, insert);
		bch2_cut_back(new->k.p, insert);
		bch2_cut_back(insert->k.p, &new->k_i);

		/*
		 * @old: extent that we read from
		 * @insert: key that we're going to update, initialized from
		 * extent currently in btree - same as @old unless we raced with
		 * other updates
		 * @new: extent with new pointers that we'll be adding to @insert
		 *
		 * First, drop rewrite_ptrs from @new:
		 */
		ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached) {
				bch2_extent_ptr_set_cached(c, &m->op.opts,
							   bkey_i_to_s(insert), ptr);
				rewrites_found |= ptr_bit;
			}
			ptr_bit <<= 1;
		}

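		/*
		 * If we were supposed to rewrite pointers but none of them were
		 * found in @insert, and the key already has sufficient
		 * durability, there's nothing left for us to do:
		 */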
		if (m->data_opts.rewrite_ptrs &&
		    !rewrites_found &&
		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
			goto nowork;
		}

		/*
		 * A replica that we just wrote might conflict with a replica
		 * that we want to keep, due to racing with another move:
		 */
restart_drop_conflicting_replicas:
		extent_for_each_ptr(extent_i_to_s(new), ptr)
			if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
			    !ptr_c->cached) {
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
				goto restart_drop_conflicting_replicas;
			}

		if (!bkey_val_u64s(&new->k)) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
			goto nowork;
		}

		/* Now, drop pointers that conflict with what we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);

		durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));

		/* Now, drop excess replicas: */
		rcu_read_lock();
restart_drop_extra_replicas:
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
			unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);

			if (!p.ptr.cached &&
			    durability - ptr_durability >= m->op.opts.data_replicas) {
				durability -= ptr_durability;

				bch2_extent_ptr_set_cached(c, &m->op.opts,
							   bkey_i_to_s(insert), &entry->ptr);
				goto restart_drop_extra_replicas;
			}
		}
		rcu_read_unlock();

		/* Finally, add the pointers we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			bch2_extent_ptr_decoded_append(insert, &p);

		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
		bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));

		ret = bch2_sum_sector_overwrites(trans, &iter, insert,
						 &should_check_enospc,
						 &i_sectors_delta,
						 &disk_sectors_delta);
		if (ret)
			goto err;

		if (disk_sectors_delta > (s64) op->res.sectors) {
			ret = bch2_disk_reservation_add(c, &op->res,
						disk_sectors_delta - op->res.sectors,
						!should_check_enospc
						? BCH_DISK_RESERVATION_NOFAIL : 0);
			if (ret)
				goto out;
		}

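		/* Position to advance the iterator to once this update commits: */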
		next_pos = insert->k.p;

		/*
		 * Check for nonce offset inconsistency:
		 * This is debug code - we've been seeing this bug rarely, and
		 * it's been hard to reproduce, so this should give us some more
		 * information when it does occur:
		 */
		int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
						 (struct bkey_validate_context) {
							.btree	= m->btree_id,
							.flags	= BCH_VALIDATE_commit,
						 });
		if (invalid) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "about to insert invalid key in data update path");
			prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
			prt_str(&buf, "\nold: ");
			bch2_bkey_val_to_text(&buf, c, old);
			prt_str(&buf, "\nk: ");
			bch2_bkey_val_to_text(&buf, c, k);
			prt_str(&buf, "\nnew: ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

			bch2_print_string_as_lines(KERN_ERR, buf.buf);
			printbuf_exit(&buf);

			bch2_fatal_error(c);
			ret = -BCH_ERR_invalid_bkey;
			goto out;
		}

		if (trace_data_update_enabled()) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "\nold: ");
			bch2_bkey_val_to_text(&buf, c, old);
			prt_str(&buf, "\nk: ");
			bch2_bkey_val_to_text(&buf, c, k);
			prt_str(&buf, "\nnew: ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

			trace_data_update(c, buf.buf);
			printbuf_exit(&buf);
		}

		printbuf_reset(&journal_msg);
		prt_str(&journal_msg, bch2_data_update_type_strs[m->type]);

		ret =	bch2_trans_log_msg(trans, &journal_msg) ?:
			bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, bkey_start_pos(&insert->k)) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, insert->k.p) ?:
			bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
			bch2_trans_update(trans, &iter, insert,
				BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_trans_commit(trans, &op->res,
				NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc|
				m->data_opts.btree_insert_flags);
		if (!ret) {
			bch2_btree_iter_set_pos(trans, &iter, next_pos);

			this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
			if (trace_io_move_finish_enabled())
				trace_io_move_finish2(m, &new->k_i, insert);
		}
err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
		if (ret)
			break;
next:
		while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
			bch2_keylist_pop_front(keys);
			if (bch2_keylist_empty(keys))
				goto out;
		}
		continue;
nowork:
		if (m->stats) {
			BUG_ON(k.k->p.offset <= iter.pos.offset);
			atomic64_inc(&m->stats->keys_raced);
			atomic64_add(k.k->p.offset - iter.pos.offset,
				     &m->stats->sectors_raced);
		}

		count_event(c, io_move_fail);

		bch2_btree_iter_advance(trans, &iter);
		goto next;
	}
out:
	printbuf_exit(&journal_msg);
	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&_insert, c);
	bch2_bkey_buf_exit(&_new, c);
	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
	return ret;
}

int bch2_data_update_index_update(struct bch_write_op *op)
{
	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
}

void bch2_data_update_read_done(struct data_update *m)
{
	m->read_done = true;

	/* write bio must own pages: */
	BUG_ON(!m->op.wbio.bio.bi_vcnt);

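	/*
	 * The data was read as it exists on disk (BCH_WRITE_data_encoded), so
	 * carry the checksum/compression info from the read over to the write:
	 */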
	m->op.crc = m->rbio.pick.crc;
	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;

	this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);

	closure_call(&m->op.cl, bch2_write, NULL, NULL);
}

void bch2_data_update_exit(struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);

	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
	kfree(update->bvecs);
	update->bvecs = NULL;

	if (c->opts.nocow_enabled)
		bkey_nocow_unlock(c, k);
	bkey_put_dev_refs(c, k);
	bch2_disk_reservation_put(c, &update->op.res);
	bch2_bkey_buf_exit(&update->k, c);
}

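/*
 * An unwritten (fallocated) extent has no data to copy: we only allocate new
 * space, flag the new pointers as unwritten, and swap them in via the normal
 * index update path:
 */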
static int bch2_update_unwritten_extent(struct btree_trans *trans,
					struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_i_extent *e;
	struct write_point *wp;
	struct closure cl;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	closure_init_stack(&cl);
	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);

	while (bpos_lt(update->op.pos, update->k.k->k.p)) {
		unsigned sectors = update->k.k->k.p.offset -
			update->op.pos.offset;

		bch2_trans_begin(trans);

		bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
				     BTREE_ITER_slots);
		ret = lockrestart_do(trans, ({
			k = bch2_btree_iter_peek_slot(trans, &iter);
			bkey_err(k);
		}));
		bch2_trans_iter_exit(trans, &iter);

		if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
			break;

		e = bkey_extent_init(update->op.insert_keys.top);
		e->k.p = update->op.pos;

		ret = bch2_alloc_sectors_start_trans(trans,
				update->op.target,
				false,
				update->op.write_point,
				&update->op.devs_have,
				update->op.nr_replicas,
				update->op.nr_replicas,
				update->op.watermark,
				0, &cl, &wp);
		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
			bch2_trans_unlock(trans);
			closure_sync(&cl);
			continue;
		}

		bch_err_fn_ratelimited(c, ret);

		if (ret)
			break;

		sectors = min(sectors, wp->sectors_free);

		bch2_key_resize(&e->k, sectors);

		bch2_open_bucket_get(c, wp, &update->op.open_buckets);
		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
		bch2_alloc_sectors_done(c, wp);

		update->op.pos.offset += sectors;

		extent_for_each_ptr(extent_i_to_s(e), ptr)
			ptr->unwritten = true;
		bch2_keylist_push(&update->op.insert_keys);

		ret = __bch2_data_update_index_update(trans, &update->op);

		bch2_open_buckets_put(c, &update->op.open_buckets);

		if (ret)
			break;
	}

	if (closure_nr_remaining(&cl) != 1) {
		bch2_trans_unlock(trans);
		closure_sync(&cl);
	}

	return ret;
}

void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 20);

	prt_str_indented(out, "rewrite ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
	prt_newline(out);

	prt_str_indented(out, "kill ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
	prt_newline(out);

	prt_str_indented(out, "target:\t");
	bch2_target_to_text(out, c, data_opts->target);
	prt_newline(out);

	prt_str_indented(out, "compression:\t");
	bch2_compression_opt_to_text(out, io_opts->background_compression);
	prt_newline(out);

	prt_str_indented(out, "opts.replicas:\t");
	prt_u64(out, io_opts->data_replicas);
	prt_newline(out);

	prt_str_indented(out, "extra replicas:\t");
	prt_u64(out, data_opts->extra_replicas);
}

void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{
	prt_str(out, bch2_data_update_type_strs[m->type]);
	prt_newline(out);

	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
	prt_newline(out);

	prt_str_indented(out, "old key:\t");
	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
}

void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
{
	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
	prt_newline(out);
	printbuf_indent_add(out, 2);
	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
	prt_printf(out, "read_done:\t%u\n", m->read_done);
	bch2_write_op_to_text(out, &m->op);
	printbuf_indent_sub(out, 2);
}

int bch2_extent_drop_ptrs(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts->kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts->kill_ptrs);

		bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts->kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

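/*
 * Set up the read and write bios for a data update: both share one bvec array,
 * sized for the uncompressed extent since the write path may need to
 * decompress what was read:
 */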
int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
			       struct bch_io_opts *io_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	/* write path might have to decompress data: */
	unsigned buf_bytes = 0;
	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);

	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);

	m->bvecs = kmalloc_array(nr_vecs, sizeof(*m->bvecs), GFP_KERNEL);
	if (!m->bvecs)
		return -ENOMEM;

	bio_init(&m->rbio.bio,		NULL, m->bvecs, nr_vecs, REQ_OP_READ);
	bio_init(&m->op.wbio.bio,	NULL, m->bvecs, nr_vecs, 0);

	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
		kfree(m->bvecs);
		m->bvecs = NULL;
		return -ENOMEM;
	}

	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
	m->rbio.data_update = true;
	m->rbio.bio.bi_iter.bi_size = buf_bytes;
	m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
	m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
	return 0;
}

static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
	    unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
		return -BCH_ERR_data_update_done_would_block;

	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
		? m->op.target
		: 0;
	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);

	darray_for_each(m->op.devs_have, i)
		__clear_bit(*i, devs.d);

	rcu_read_lock();
	unsigned nr_replicas = 0, i;
	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
		struct bch_dev *ca = bch2_dev_rcu(c, i);

		struct bch_dev_usage usage;
		bch2_dev_usage_read_fast(ca, &usage);

		if (!dev_buckets_free(ca, usage, m->op.watermark))
			continue;

		nr_replicas += ca->mi.durability;
		if (nr_replicas >= m->op.nr_replicas)
			break;
	}
	rcu_read_unlock();

	if (!nr_replicas)
		return -BCH_ERR_data_update_done_no_rw_devs;
	if (nr_replicas < m->op.nr_replicas)
		return -BCH_ERR_insufficient_devices;
	return 0;
}

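/*
 * Prepare a data update: decide how many new replicas need to be written,
 * reserve disk space, take device references and nocow locks, and allocate the
 * bios. On failure, everything acquired here is released before returning.
 */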
int bch2_data_update_init(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct moving_context *ctxt,
			  struct data_update *m,
			  struct write_point_specifier wp,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts data_opts,
			  enum btree_id btree_id,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
	int ret = 0;

	/*
	 * If the fs is corrupt we may have a key for a snapshot node that
	 * doesn't exist, and we have to check for this because we go rw before
	 * repairing the snapshots table - just skip it, we can move it later.
	 */
	if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
		return -BCH_ERR_data_update_done_no_snapshot;

	bch2_bkey_buf_init(&m->k);
	bch2_bkey_buf_reassemble(&m->k, c, k);
	m->type		= data_opts.btree_insert_flags & BCH_WATERMARK_copygc
		? BCH_DATA_UPDATE_copygc
		: BCH_DATA_UPDATE_rebalance;
	m->btree_id	= btree_id;
	m->data_opts	= data_opts;
	m->ctxt		= ctxt;
	m->stats	= ctxt ? ctxt->stats : NULL;

	bch2_write_op_init(&m->op, c, *io_opts);
	m->op.pos	= bkey_start_pos(k.k);
	m->op.version	= k.k->bversion;
	m->op.target	= data_opts.target;
	m->op.write_point = wp;
	m->op.nr_replicas = 0;
	m->op.flags	|= BCH_WRITE_pages_stable|
		BCH_WRITE_pages_owned|
		BCH_WRITE_data_encoded|
		BCH_WRITE_move|
		m->data_opts.write_flags;
	m->op.compression_opt	= io_opts->background_compression;
	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;

	unsigned durability_have = 0, durability_removing = 0;

	unsigned ptr_bit = 1;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (!p.ptr.cached) {
			rcu_read_lock();
			if (ptr_bit & m->data_opts.rewrite_ptrs) {
				if (crc_is_compressed(p.crc))
					reserve_sectors += k.k->size;

				m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
				durability_removing += bch2_extent_ptr_desired_durability(c, &p);
			} else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
				bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
				durability_have += bch2_extent_ptr_durability(c, &p);
			}
			rcu_read_unlock();
		}

		/*
		 * op->csum_type is normally initialized from the fs/file's
		 * current options - but if an extent is encrypted, we require
		 * that it stays encrypted:
		 */
		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
			m->op.nonce	= p.crc.nonce + p.crc.offset;
			m->op.csum_type	= p.crc.csum_type;
		}

		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			m->op.incompressible = true;

		ptr_bit <<= 1;
	}

	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));

	/*
	 * If current extent durability is less than io_opts.data_replicas,
	 * we're not trying to rereplicate the extent up to data_replicas here -
	 * unless extra_replicas was specified
	 *
	 * Increasing replication is an explicit operation triggered by
	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
	 */
	m->op.nr_replicas = min(durability_removing, durability_required) +
		m->data_opts.extra_replicas;

	/*
	 * If device(s) were set to durability=0 after data was written to them
	 * we can end up with a durability=0 extent, and the normal algorithm
	 * that tries not to increase durability doesn't work:
	 */
	if (!(durability_have + durability_removing))
		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);

	m->op.nr_replicas_required = m->op.nr_replicas;

	/*
	 * It might turn out that we don't need any new replicas, if the
	 * replicas or durability settings have been changed since the extent
	 * was written:
	 */
	if (!m->op.nr_replicas) {
		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
		m->data_opts.rewrite_ptrs = 0;
		/* if iter == NULL, it's just a promote */
		if (iter)
			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
		if (!ret)
			ret = -BCH_ERR_data_update_done_no_writes_needed;
		goto out_bkey_buf_exit;
	}

	/*
	 * Check if the allocation will succeed, to avoid getting an error later
	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
	 * read:
	 *
	 * This guards against
	 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
	 * - Destination target full
	 * - Device(s) in destination target offline
	 * - Insufficient durability available in destination target
	 *   (i.e. trying to move a durability=2 replica to a target with a
	 *   single durability=1 device)
	 */
	ret = can_write_extent(c, m);
	if (ret)
		goto out_bkey_buf_exit;

	if (reserve_sectors) {
		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
				m->data_opts.extra_replicas
				? 0
				: BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			goto out_bkey_buf_exit;
	}

	if (!bkey_get_dev_refs(c, k)) {
		ret = -BCH_ERR_data_update_done_no_dev_refs;
		goto out_put_disk_res;
	}

	if (c->opts.nocow_enabled &&
	    !bkey_nocow_lock(c, ctxt, k)) {
		ret = -BCH_ERR_nocow_lock_blocked;
		goto out_put_dev_refs;
	}

	if (bkey_extent_is_unwritten(k)) {
		ret = bch2_update_unwritten_extent(trans, m) ?:
			-BCH_ERR_data_update_done_unwritten;
		goto out_nocow_unlock;
	}

	ret = bch2_data_update_bios_init(m, c, io_opts);
	if (ret)
		goto out_nocow_unlock;

	return 0;
out_nocow_unlock:
	if (c->opts.nocow_enabled)
		bkey_nocow_unlock(c, k);
out_put_dev_refs:
	bkey_put_dev_refs(c, k);
out_put_disk_res:
	bch2_disk_reservation_put(c, &m->op.res);
out_bkey_buf_exit:
	bch2_bkey_buf_exit(&m->k, c);
	return ret;
}

void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned ptr_bit = 1;

	bkey_for_each_ptr(ptrs, ptr) {
		if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
			opts->kill_ptrs		|= ptr_bit;
			opts->rewrite_ptrs	^= ptr_bit;
		}

		ptr_bit <<= 1;
	}
}