// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "tree-checker.h"

#define MAX_CONFLICT_INODES 10

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir. After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
	unsigned int nofs_flag;
	struct btrfs_inode *inode;

	/*
	 * We're holding a transaction handle whether we are logging or
	 * replaying a log tree, so we must make sure NOFS semantics apply
	 * because btrfs_alloc_inode() may be triggered and it uses GFP_KERNEL
	 * to allocate an inode, which can recurse back into the filesystem and
	 * attempt a transaction commit, resulting in a deadlock.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget(objectid, root);
	memalloc_nofs_restore(nofs_flag);

	return inode;
}

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing.
 * The others are state fields used for that specific part.
 */
struct walk_control {
	/* should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		struct btrfs_tree_parent_check check = {
			.level = level,
			.transid = gen
		};

		ret = btrfs_read_extent_buffer(eb, &check);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

/*
 * Item overwrite used by log replay. The given eb, slot and key all refer to
 * the source data we are copying out.
 *
 * The given root is for the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and will be
 * released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	struct extent_buffer *dst_eb;
	int dst_slot;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	/*
	 * This is only used during log replay, so the root is always from a
	 * fs/subvolume tree. In case we ever need to support a log root, then
	 * we'll have to clone the leaf in the path, release the path and use
	 * the leaf before writing into the log tree. See the comments at
	 * copy_items() for more details.
	 */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	dst_eb = path->nodes[0];
	dst_slot = path->slots[0];

	if (ret == 0) {
		char *src_copy;
		const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);

		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!src_copy) {
			btrfs_release_path(path);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);
		dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
		ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);

		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(dst_eb, dst_slot,
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(dst_eb, item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	dst_eb = path->nodes[0];
	dst_slot = path->slots[0];

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		const u32 found_size = btrfs_item_size(dst_eb, dst_slot);

		if (found_size > item_size)
			btrfs_truncate_item(trans, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(trans, path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(dst_eb, dst_item);
		}
	}

	copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(dst_eb, dst_item) == 0)
			btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
	}
no_copy:
	btrfs_release_path(path);
	return 0;
}

static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
			       struct fscrypt_str *name)
{
	char *buf;

	buf = kmalloc(len, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(eb, buf, (unsigned long)start, len);
	name->name = buf;
	name->len = len;
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
						   u64 objectid)
{
	struct btrfs_inode *inode;

	inode = btrfs_iget_logging(objectid, root);
	if (IS_ERR(inode))
		return NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct btrfs_inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		btrfs_err(fs_info,
			  "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
			  found_type, btrfs_root_id(root), key->objectid, key->offset);
		return -EUCLEAN;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode)
		return -EIO;

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item existing;
		unsigned long ptr;

		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
					 sizeof(existing)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item));
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				struct btrfs_ref ref = {
					.action = BTRFS_ADD_DELAYED_REF,
					.bytenr = ins.objectid,
					.num_bytes = ins.offset,
					.owning_root = btrfs_root_id(root),
					.ref_root = btrfs_root_id(root),
				};
				btrfs_init_data_ref(&ref, key->objectid, offset,
						    0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						btrfs_root_id(root),
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_list(root->log_root,
						      csum_start, csum_end - 1,
						      &ordered_sums, false);
			if (ret < 0)
				goto out;
			ret = 0;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent.
			 * Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_first_entry(&ordered_sums,
							struct btrfs_ordered_sum,
							list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->logical);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->logical,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, inode);
out:
	iput(&inode->vfs_inode);
	return ret;
}

static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_inode *inode;
	struct fscrypt_str name;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret)
		return -ENOMEM;

	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
	kfree(name.name);
	if (inode)
		iput(&inode->vfs_inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid, name);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0], name);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, struct fscrypt_str *name)
{
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.
		 * If so, we allow them to stay;
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			struct fscrypt_str victim_name;

			victim_ref = (struct btrfs_inode_ref *)ptr;
			ret = read_alloc_one_name(leaf, (victim_ref + 1),
				 btrfs_inode_ref_name_len(leaf, victim_ref),
				 &victim_name);
			if (ret)
				return ret;

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
								  &victim_name);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
		}
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct btrfs_inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			struct fscrypt_str victim_name;

			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			ret = read_alloc_one_name(leaf, &extref->name,
				 btrfs_inode_extref_name_len(leaf, extref),
				 &victim_name);
			if (ret)
				return ret;

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name.name,
							      victim_name.len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							victim_parent,
							inode, &victim_name);
					iput(&victim_parent->vfs_inode);
				}
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);
next:
			cur_offset += victim_name.len + sizeof(*extref);
		}
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

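/*
 * Helpers to pull the pieces of an inode reference out of a log tree leaf:
 * the name, the directory index and, for extended refs, the objectid of the
 * parent directory.
 */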
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     struct fscrypt_str *name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;
	int ret;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	ret = read_alloc_one_name(eb, &extref->name,
				  btrfs_inode_extref_name_len(eb, extref), name);
	if (ret)
		return ret;

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  struct fscrypt_str *name, u64 *index)
{
	struct btrfs_inode_ref *ref;
	int ret;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
				  name);
	if (ret)
		return ret;

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		struct fscrypt_str name;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, &name);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);

		if (!ret) {
			struct btrfs_inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name.name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
			kfree(name.name);
			iput(&dir->vfs_inode);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name.name);
		ref_ptr += name.len;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct btrfs_inode *dir = NULL;
	struct btrfs_inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct fscrypt_str name = { 0 };
	int ret;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				   ref_index, &name);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */
			ret = __add_inode_ref(trans, root, path, log, dir, inode,
					      inode_objectid, parent_objectid,
					      ref_index, &name);
			if (ret) {
				if (ret == 1)
					ret = 0;
				goto out;
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, inode);
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
		kfree(name.name);
		name.name = NULL;
		if (log_ref_ver) {
			iput(&dir->vfs_inode);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name.name);
	if (dir)
		iput(&dir->vfs_inode);
	if (inode)
		iput(&inode->vfs_inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
					    path, &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(inode, path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->vfs_inode.i_nlink) {
		set_nlink(&inode->vfs_inode, nlink);
		ret = btrfs_update_inode(trans, inode);
		if (ret)
			goto out;
	}
	if (S_ISDIR(inode->vfs_inode.i_mode))
		inode->index_cnt = (u64)-1;

	if (inode->vfs_inode.i_nlink == 0) {
		if (S_ISDIR(inode->vfs_inode.i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		struct btrfs_inode *inode;

		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, inode);
		iput(&inode->vfs_inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.
 * The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct btrfs_inode *inode;
	struct inode *vfs_inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	vfs_inode = &inode->vfs_inode;
	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!vfs_inode->i_nlink)
			set_nlink(vfs_inode, 1);
		else
			inc_nlink(vfs_inode);
		ret = btrfs_update_inode(trans, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(vfs_inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct btrfs_inode *inode;
	struct btrfs_inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(&inode->vfs_inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(&inode->vfs_inode);
	iput(&dir->vfs_inode);
	return ret;
}

static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
					struct btrfs_inode *dir,
					struct btrfs_path *path,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_flags,
					bool exists)
{
	struct btrfs_key found_key;

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* The existing dentry points to the same inode, don't delete it. */
	if (found_key.objectid == log_key->objectid &&
	    found_key.type == log_key->type &&
	    found_key.offset == log_key->offset &&
	    btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
		return 1;

	/*
	 * Don't drop the conflicting directory entry if the inode for the new
	 * entry doesn't exist.
	 */
	if (!exists)
		return 0;

	return drop_one_dir_item(trans, path, dir, dst_di);
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	struct fscrypt_str name = { 0 };
	struct btrfs_dir_item *dir_dst_di;
	struct btrfs_dir_item *index_dst_di;
	bool dir_dst_matches = false;
	bool index_dst_matches = false;
	struct btrfs_key log_key;
	struct btrfs_key search_key;
	struct btrfs_inode *dir;
	u8 log_flags;
	bool exists;
	int ret;
	bool update_size = true;
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
	if (ret)
		goto out;

	log_flags = btrfs_dir_flags(eb, di);
	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	btrfs_release_path(path);
	if (ret < 0)
		goto out;
	exists = (ret == 0);
	ret = 0;

	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					   &name, 1);
	if (IS_ERR(dir_dst_di)) {
		ret = PTR_ERR(dir_dst_di);
		goto out;
	} else if (dir_dst_di) {
		ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
						   &log_key, log_flags, exists);
		if (ret < 0)
			goto out;
		dir_dst_matches = (ret == 1);
	}

	btrfs_release_path(path);

	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						   key->objectid, key->offset,
						   &name, 1);
	if (IS_ERR(index_dst_di)) {
		ret = PTR_ERR(index_dst_di);
		goto out;
	} else if (index_dst_di) {
		ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
						   &log_key, log_flags, exists);
		if (ret < 0)
			goto out;
		index_dst_matches = (ret == 1);
	}

	btrfs_release_path(path);

	if (dir_dst_matches && index_dst_matches) {
		ret = 0;
		update_size = false;
		goto out;
	}

	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &search_key, 0, &name);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}

	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
*/
1927 ret = 0;
1928 update_size = false;
1929 goto out;
1930 }
1931 btrfs_release_path(path);
1932 ret = insert_one_name(trans, root, key->objectid, key->offset,
1933 &name, &log_key);
1934 if (ret && ret != -ENOENT && ret != -EEXIST)
1935 goto out;
1936 if (!ret)
1937 name_added = true;
1938 update_size = false;
1939 ret = 0;
1940
1941 out:
1942 if (!ret && update_size) {
1943 btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
1944 ret = btrfs_update_inode(trans, dir);
1945 }
1946 kfree(name.name);
1947 iput(&dir->vfs_inode);
1948 if (!ret && name_added)
1949 ret = 1;
1950 return ret;
1951 }
1952
1953 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
1954 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1955 struct btrfs_root *root,
1956 struct btrfs_path *path,
1957 struct extent_buffer *eb, int slot,
1958 struct btrfs_key *key)
1959 {
1960 int ret;
1961 struct btrfs_dir_item *di;
1962
1963 /* We only log dir index keys, which only contain a single dir item. */
1964 ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
1965
1966 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1967 ret = replay_one_name(trans, root, path, eb, di, key);
1968 if (ret < 0)
1969 return ret;
1970
1971 /*
1972 * If this entry refers to a non-directory (directories cannot have a
1973 * link count > 1) and it was added in a transaction that was not
1974 * committed, make sure we fixup the link count of the inode the entry
1975 * points to. Otherwise something like the following would result in a
1976 * directory pointing to an inode with a wrong link count that does not
1977 * account for this dir entry:
1978 *
1979 * mkdir testdir
1980 * touch testdir/foo
1981 * touch testdir/bar
1982 * sync
1983 *
1984 * ln testdir/bar testdir/bar_link
1985 * ln testdir/foo testdir/foo_link
1986 * xfs_io -c "fsync" testdir/bar
1987 *
1988 * <power failure>
1989 *
1990 * mount fs, log replay happens
1991 *
1992 * File foo would remain with a link count of 1 when it has two entries
1993 * pointing to it in the directory testdir. This would make it impossible
1994 * to ever delete the parent directory, as it would result in stale
1995 * dentries that can never be deleted.
1996 */
1997 if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
1998 struct btrfs_path *fixup_path;
1999 struct btrfs_key di_key;
2000
2001 fixup_path = btrfs_alloc_path();
2002 if (!fixup_path)
2003 return -ENOMEM;
2004
2005 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2006 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2007 btrfs_free_path(fixup_path);
2008 }
2009
2010 return ret;
2011 }
2012
2013 /*
2014 * directory replay has two parts. There are the standard directory
2015 * items in the log copied from the subvolume, and range items
2016 * created in the log while the subvolume was logged.
2017 *
2018 * The range items tell us which parts of the key space the log
2019 * is authoritative for. During replay, if a key in the subvolume
2020 * directory is in a logged range item, but not actually in the log,
2021 * that means it was deleted from the directory before the fsync
2022 * and should be removed.
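 *
 * A hypothetical example: a BTRFS_DIR_LOG_INDEX_KEY item
 * (258 DIR_LOG_INDEX 3) with an end offset of 7 makes the log
 * authoritative for index offsets [3, 7] of directory 258. A dir index
 * key at offset 5 found in the subvolume but missing from the log must
 * have been deleted before the fsync, so replay removes it; an entry at
 * offset 9 is outside the logged range and is left untouched.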
2023 */ 2024 static noinline int find_dir_range(struct btrfs_root *root, 2025 struct btrfs_path *path, 2026 u64 dirid, 2027 u64 *start_ret, u64 *end_ret) 2028 { 2029 struct btrfs_key key; 2030 u64 found_end; 2031 struct btrfs_dir_log_item *item; 2032 int ret; 2033 int nritems; 2034 2035 if (*start_ret == (u64)-1) 2036 return 1; 2037 2038 key.objectid = dirid; 2039 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2040 key.offset = *start_ret; 2041 2042 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2043 if (ret < 0) 2044 goto out; 2045 if (ret > 0) { 2046 if (path->slots[0] == 0) 2047 goto out; 2048 path->slots[0]--; 2049 } 2050 if (ret != 0) 2051 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2052 2053 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2054 ret = 1; 2055 goto next; 2056 } 2057 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2058 struct btrfs_dir_log_item); 2059 found_end = btrfs_dir_log_end(path->nodes[0], item); 2060 2061 if (*start_ret >= key.offset && *start_ret <= found_end) { 2062 ret = 0; 2063 *start_ret = key.offset; 2064 *end_ret = found_end; 2065 goto out; 2066 } 2067 ret = 1; 2068 next: 2069 /* check the next slot in the tree to see if it is a valid item */ 2070 nritems = btrfs_header_nritems(path->nodes[0]); 2071 path->slots[0]++; 2072 if (path->slots[0] >= nritems) { 2073 ret = btrfs_next_leaf(root, path); 2074 if (ret) 2075 goto out; 2076 } 2077 2078 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2079 2080 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2081 ret = 1; 2082 goto out; 2083 } 2084 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2085 struct btrfs_dir_log_item); 2086 found_end = btrfs_dir_log_end(path->nodes[0], item); 2087 *start_ret = key.offset; 2088 *end_ret = found_end; 2089 ret = 0; 2090 out: 2091 btrfs_release_path(path); 2092 return ret; 2093 } 2094 2095 /* 2096 * this looks for a given directory item in the log. If the directory 2097 * item is not in the log, the item is removed and the inode it points 2098 * to is unlinked 2099 */ 2100 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2101 struct btrfs_root *log, 2102 struct btrfs_path *path, 2103 struct btrfs_path *log_path, 2104 struct btrfs_inode *dir, 2105 struct btrfs_key *dir_key) 2106 { 2107 struct btrfs_root *root = dir->root; 2108 int ret; 2109 struct extent_buffer *eb; 2110 int slot; 2111 struct btrfs_dir_item *di; 2112 struct fscrypt_str name = { 0 }; 2113 struct btrfs_inode *inode = NULL; 2114 struct btrfs_key location; 2115 2116 /* 2117 * Currently we only log dir index keys. Even if we replay a log created 2118 * by an older kernel that logged both dir index and dir item keys, all 2119 * we need to do is process the dir index keys, we (and our caller) can 2120 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 2121 */ 2122 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2123 2124 eb = path->nodes[0]; 2125 slot = path->slots[0]; 2126 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2127 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 2128 if (ret) 2129 goto out; 2130 2131 if (log) { 2132 struct btrfs_dir_item *log_di; 2133 2134 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2135 dir_key->objectid, 2136 dir_key->offset, &name, 0); 2137 if (IS_ERR(log_di)) { 2138 ret = PTR_ERR(log_di); 2139 goto out; 2140 } else if (log_di) { 2141 /* The dentry exists in the log, we have nothing to do. 
*/ 2142 ret = 0; 2143 goto out; 2144 } 2145 } 2146 2147 btrfs_dir_item_key_to_cpu(eb, di, &location); 2148 btrfs_release_path(path); 2149 btrfs_release_path(log_path); 2150 inode = read_one_inode(root, location.objectid); 2151 if (!inode) { 2152 ret = -EIO; 2153 goto out; 2154 } 2155 2156 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2157 if (ret) 2158 goto out; 2159 2160 inc_nlink(&inode->vfs_inode); 2161 ret = unlink_inode_for_log_replay(trans, dir, inode, &name); 2162 /* 2163 * Unlike dir item keys, dir index keys can only have one name (entry) in 2164 * them, as there are no key collisions since each key has a unique offset 2165 * (an index number), so we're done. 2166 */ 2167 out: 2168 btrfs_release_path(path); 2169 btrfs_release_path(log_path); 2170 kfree(name.name); 2171 if (inode) 2172 iput(&inode->vfs_inode); 2173 return ret; 2174 } 2175 2176 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2177 struct btrfs_root *root, 2178 struct btrfs_root *log, 2179 struct btrfs_path *path, 2180 const u64 ino) 2181 { 2182 struct btrfs_key search_key; 2183 struct btrfs_path *log_path; 2184 int i; 2185 int nritems; 2186 int ret; 2187 2188 log_path = btrfs_alloc_path(); 2189 if (!log_path) 2190 return -ENOMEM; 2191 2192 search_key.objectid = ino; 2193 search_key.type = BTRFS_XATTR_ITEM_KEY; 2194 search_key.offset = 0; 2195 again: 2196 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2197 if (ret < 0) 2198 goto out; 2199 process_leaf: 2200 nritems = btrfs_header_nritems(path->nodes[0]); 2201 for (i = path->slots[0]; i < nritems; i++) { 2202 struct btrfs_key key; 2203 struct btrfs_dir_item *di; 2204 struct btrfs_dir_item *log_di; 2205 u32 total_size; 2206 u32 cur; 2207 2208 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2209 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2210 ret = 0; 2211 goto out; 2212 } 2213 2214 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2215 total_size = btrfs_item_size(path->nodes[0], i); 2216 cur = 0; 2217 while (cur < total_size) { 2218 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2219 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2220 u32 this_len = sizeof(*di) + name_len + data_len; 2221 char *name; 2222 2223 name = kmalloc(name_len, GFP_NOFS); 2224 if (!name) { 2225 ret = -ENOMEM; 2226 goto out; 2227 } 2228 read_extent_buffer(path->nodes[0], name, 2229 (unsigned long)(di + 1), name_len); 2230 2231 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2232 name, name_len, 0); 2233 btrfs_release_path(log_path); 2234 if (!log_di) { 2235 /* Doesn't exist in log tree, so delete it. 
*/ 2236 btrfs_release_path(path); 2237 di = btrfs_lookup_xattr(trans, root, path, ino, 2238 name, name_len, -1); 2239 kfree(name); 2240 if (IS_ERR(di)) { 2241 ret = PTR_ERR(di); 2242 goto out; 2243 } 2244 ASSERT(di); 2245 ret = btrfs_delete_one_dir_name(trans, root, 2246 path, di); 2247 if (ret) 2248 goto out; 2249 btrfs_release_path(path); 2250 search_key = key; 2251 goto again; 2252 } 2253 kfree(name); 2254 if (IS_ERR(log_di)) { 2255 ret = PTR_ERR(log_di); 2256 goto out; 2257 } 2258 cur += this_len; 2259 di = (struct btrfs_dir_item *)((char *)di + this_len); 2260 } 2261 } 2262 ret = btrfs_next_leaf(root, path); 2263 if (ret > 0) 2264 ret = 0; 2265 else if (ret == 0) 2266 goto process_leaf; 2267 out: 2268 btrfs_free_path(log_path); 2269 btrfs_release_path(path); 2270 return ret; 2271 } 2272 2273 2274 /* 2275 * deletion replay happens before we copy any new directory items 2276 * out of the log or out of backreferences from inodes. It 2277 * scans the log to find ranges of keys that log is authoritative for, 2278 * and then scans the directory to find items in those ranges that are 2279 * not present in the log. 2280 * 2281 * Anything we don't find in the log is unlinked and removed from the 2282 * directory. 2283 */ 2284 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2285 struct btrfs_root *root, 2286 struct btrfs_root *log, 2287 struct btrfs_path *path, 2288 u64 dirid, int del_all) 2289 { 2290 u64 range_start; 2291 u64 range_end; 2292 int ret = 0; 2293 struct btrfs_key dir_key; 2294 struct btrfs_key found_key; 2295 struct btrfs_path *log_path; 2296 struct btrfs_inode *dir; 2297 2298 dir_key.objectid = dirid; 2299 dir_key.type = BTRFS_DIR_INDEX_KEY; 2300 log_path = btrfs_alloc_path(); 2301 if (!log_path) 2302 return -ENOMEM; 2303 2304 dir = read_one_inode(root, dirid); 2305 /* it isn't an error if the inode isn't there, that can happen 2306 * because we replay the deletes before we copy in the inode item 2307 * from the log 2308 */ 2309 if (!dir) { 2310 btrfs_free_path(log_path); 2311 return 0; 2312 } 2313 2314 range_start = 0; 2315 range_end = 0; 2316 while (1) { 2317 if (del_all) 2318 range_end = (u64)-1; 2319 else { 2320 ret = find_dir_range(log, path, dirid, 2321 &range_start, &range_end); 2322 if (ret < 0) 2323 goto out; 2324 else if (ret > 0) 2325 break; 2326 } 2327 2328 dir_key.offset = range_start; 2329 while (1) { 2330 int nritems; 2331 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2332 0, 0); 2333 if (ret < 0) 2334 goto out; 2335 2336 nritems = btrfs_header_nritems(path->nodes[0]); 2337 if (path->slots[0] >= nritems) { 2338 ret = btrfs_next_leaf(root, path); 2339 if (ret == 1) 2340 break; 2341 else if (ret < 0) 2342 goto out; 2343 } 2344 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2345 path->slots[0]); 2346 if (found_key.objectid != dirid || 2347 found_key.type != dir_key.type) { 2348 ret = 0; 2349 goto out; 2350 } 2351 2352 if (found_key.offset > range_end) 2353 break; 2354 2355 ret = check_item_in_log(trans, log, path, 2356 log_path, dir, 2357 &found_key); 2358 if (ret) 2359 goto out; 2360 if (found_key.offset == (u64)-1) 2361 break; 2362 dir_key.offset = found_key.offset + 1; 2363 } 2364 btrfs_release_path(path); 2365 if (range_end == (u64)-1) 2366 break; 2367 range_start = range_end + 1; 2368 } 2369 ret = 0; 2370 out: 2371 btrfs_release_path(path); 2372 btrfs_free_path(log_path); 2373 iput(&dir->vfs_inode); 2374 return ret; 2375 } 2376 2377 /* 2378 * the process_func used to replay items from the log tree. 
This 2379 * gets called in two different stages. The first stage just looks 2380 * for inodes and makes sure they are all copied into the subvolume. 2381 * 2382 * The second stage copies all the other item types from the log into 2383 * the subvolume. The two stage approach is slower, but gets rid of 2384 * lots of complexity around inodes referencing other inodes that exist 2385 * only in the log (references come from either directory items or inode 2386 * back refs). 2387 */ 2388 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2389 struct walk_control *wc, u64 gen, int level) 2390 { 2391 int nritems; 2392 struct btrfs_tree_parent_check check = { 2393 .transid = gen, 2394 .level = level 2395 }; 2396 struct btrfs_path *path; 2397 struct btrfs_root *root = wc->replay_dest; 2398 struct btrfs_key key; 2399 int i; 2400 int ret; 2401 2402 ret = btrfs_read_extent_buffer(eb, &check); 2403 if (ret) 2404 return ret; 2405 2406 level = btrfs_header_level(eb); 2407 2408 if (level != 0) 2409 return 0; 2410 2411 path = btrfs_alloc_path(); 2412 if (!path) 2413 return -ENOMEM; 2414 2415 nritems = btrfs_header_nritems(eb); 2416 for (i = 0; i < nritems; i++) { 2417 btrfs_item_key_to_cpu(eb, &key, i); 2418 2419 /* inode keys are done during the first stage */ 2420 if (key.type == BTRFS_INODE_ITEM_KEY && 2421 wc->stage == LOG_WALK_REPLAY_INODES) { 2422 struct btrfs_inode_item *inode_item; 2423 u32 mode; 2424 2425 inode_item = btrfs_item_ptr(eb, i, 2426 struct btrfs_inode_item); 2427 /* 2428 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2429 * and never got linked before the fsync, skip it, as 2430 * replaying it is pointless since it would be deleted 2431 * later. We skip logging tmpfiles, but it's always 2432 * possible we are replaying a log created with a kernel 2433 * that used to log tmpfiles. 2434 */ 2435 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2436 wc->ignore_cur_inode = true; 2437 continue; 2438 } else { 2439 wc->ignore_cur_inode = false; 2440 } 2441 ret = replay_xattr_deletes(wc->trans, root, log, 2442 path, key.objectid); 2443 if (ret) 2444 break; 2445 mode = btrfs_inode_mode(eb, inode_item); 2446 if (S_ISDIR(mode)) { 2447 ret = replay_dir_deletes(wc->trans, 2448 root, log, path, key.objectid, 0); 2449 if (ret) 2450 break; 2451 } 2452 ret = overwrite_item(wc->trans, root, path, 2453 eb, i, &key); 2454 if (ret) 2455 break; 2456 2457 /* 2458 * Before replaying extents, truncate the inode to its 2459 * size. We need to do it now and not after log replay 2460 * because before an fsync we can have prealloc extents 2461 * added beyond the inode's i_size. If we did it after, 2462 * through orphan cleanup for example, we would drop 2463 * those prealloc extents just after replaying them. 2464 */ 2465 if (S_ISREG(mode)) { 2466 struct btrfs_drop_extents_args drop_args = { 0 }; 2467 struct btrfs_inode *inode; 2468 u64 from; 2469 2470 inode = read_one_inode(root, key.objectid); 2471 if (!inode) { 2472 ret = -EIO; 2473 break; 2474 } 2475 from = ALIGN(i_size_read(&inode->vfs_inode), 2476 root->fs_info->sectorsize); 2477 drop_args.start = from; 2478 drop_args.end = (u64)-1; 2479 drop_args.drop_cache = true; 2480 ret = btrfs_drop_extents(wc->trans, root, inode, 2481 &drop_args); 2482 if (!ret) { 2483 inode_sub_bytes(&inode->vfs_inode, 2484 drop_args.bytes_found); 2485 /* Update the inode's nbytes. 
*/ 2486 ret = btrfs_update_inode(wc->trans, inode); 2487 } 2488 iput(&inode->vfs_inode); 2489 if (ret) 2490 break; 2491 } 2492 2493 ret = link_to_fixup_dir(wc->trans, root, 2494 path, key.objectid); 2495 if (ret) 2496 break; 2497 } 2498 2499 if (wc->ignore_cur_inode) 2500 continue; 2501 2502 if (key.type == BTRFS_DIR_INDEX_KEY && 2503 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2504 ret = replay_one_dir_item(wc->trans, root, path, 2505 eb, i, &key); 2506 if (ret) 2507 break; 2508 } 2509 2510 if (wc->stage < LOG_WALK_REPLAY_ALL) 2511 continue; 2512 2513 /* these keys are simply copied */ 2514 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2515 ret = overwrite_item(wc->trans, root, path, 2516 eb, i, &key); 2517 if (ret) 2518 break; 2519 } else if (key.type == BTRFS_INODE_REF_KEY || 2520 key.type == BTRFS_INODE_EXTREF_KEY) { 2521 ret = add_inode_ref(wc->trans, root, log, path, 2522 eb, i, &key); 2523 if (ret && ret != -ENOENT) 2524 break; 2525 ret = 0; 2526 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2527 ret = replay_one_extent(wc->trans, root, path, 2528 eb, i, &key); 2529 if (ret) 2530 break; 2531 } 2532 /* 2533 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2534 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2535 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2536 * older kernel with such keys, ignore them. 2537 */ 2538 } 2539 btrfs_free_path(path); 2540 return ret; 2541 } 2542 2543 /* 2544 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2545 */ 2546 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2547 { 2548 struct btrfs_block_group *cache; 2549 2550 cache = btrfs_lookup_block_group(fs_info, start); 2551 if (!cache) { 2552 btrfs_err(fs_info, "unable to find block group for %llu", start); 2553 return; 2554 } 2555 2556 spin_lock(&cache->space_info->lock); 2557 spin_lock(&cache->lock); 2558 cache->reserved -= fs_info->nodesize; 2559 cache->space_info->bytes_reserved -= fs_info->nodesize; 2560 spin_unlock(&cache->lock); 2561 spin_unlock(&cache->space_info->lock); 2562 2563 btrfs_put_block_group(cache); 2564 } 2565 2566 static int clean_log_buffer(struct btrfs_trans_handle *trans, 2567 struct extent_buffer *eb) 2568 { 2569 int ret; 2570 2571 btrfs_tree_lock(eb); 2572 btrfs_clear_buffer_dirty(trans, eb); 2573 wait_on_extent_buffer_writeback(eb); 2574 btrfs_tree_unlock(eb); 2575 2576 if (trans) { 2577 ret = btrfs_pin_reserved_extent(trans, eb); 2578 if (ret) 2579 return ret; 2580 } else { 2581 unaccount_log_buffer(eb->fs_info, eb->start); 2582 } 2583 2584 return 0; 2585 } 2586 2587 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2588 struct btrfs_root *root, 2589 struct btrfs_path *path, int *level, 2590 struct walk_control *wc) 2591 { 2592 struct btrfs_fs_info *fs_info = root->fs_info; 2593 u64 bytenr; 2594 u64 ptr_gen; 2595 struct extent_buffer *next; 2596 struct extent_buffer *cur; 2597 int ret = 0; 2598 2599 while (*level > 0) { 2600 struct btrfs_tree_parent_check check = { 0 }; 2601 2602 cur = path->nodes[*level]; 2603 2604 WARN_ON(btrfs_header_level(cur) != *level); 2605 2606 if (path->slots[*level] >= 2607 btrfs_header_nritems(cur)) 2608 break; 2609 2610 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2611 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2612 check.transid = ptr_gen; 2613 check.level = *level - 1; 2614 check.has_first_key = true; 2615 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); 2616 2617 next = 
btrfs_find_create_tree_block(fs_info, bytenr, 2618 btrfs_header_owner(cur), 2619 *level - 1); 2620 if (IS_ERR(next)) 2621 return PTR_ERR(next); 2622 2623 if (*level == 1) { 2624 ret = wc->process_func(root, next, wc, ptr_gen, 2625 *level - 1); 2626 if (ret) { 2627 free_extent_buffer(next); 2628 return ret; 2629 } 2630 2631 path->slots[*level]++; 2632 if (wc->free) { 2633 ret = btrfs_read_extent_buffer(next, &check); 2634 if (ret) { 2635 free_extent_buffer(next); 2636 return ret; 2637 } 2638 2639 ret = clean_log_buffer(trans, next); 2640 if (ret) { 2641 free_extent_buffer(next); 2642 return ret; 2643 } 2644 } 2645 free_extent_buffer(next); 2646 continue; 2647 } 2648 ret = btrfs_read_extent_buffer(next, &check); 2649 if (ret) { 2650 free_extent_buffer(next); 2651 return ret; 2652 } 2653 2654 if (path->nodes[*level-1]) 2655 free_extent_buffer(path->nodes[*level-1]); 2656 path->nodes[*level-1] = next; 2657 *level = btrfs_header_level(next); 2658 path->slots[*level] = 0; 2659 cond_resched(); 2660 } 2661 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2662 2663 cond_resched(); 2664 return 0; 2665 } 2666 2667 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2668 struct btrfs_root *root, 2669 struct btrfs_path *path, int *level, 2670 struct walk_control *wc) 2671 { 2672 int i; 2673 int slot; 2674 int ret; 2675 2676 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2677 slot = path->slots[i]; 2678 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2679 path->slots[i]++; 2680 *level = i; 2681 WARN_ON(*level == 0); 2682 return 0; 2683 } else { 2684 ret = wc->process_func(root, path->nodes[*level], wc, 2685 btrfs_header_generation(path->nodes[*level]), 2686 *level); 2687 if (ret) 2688 return ret; 2689 2690 if (wc->free) { 2691 ret = clean_log_buffer(trans, path->nodes[*level]); 2692 if (ret) 2693 return ret; 2694 } 2695 free_extent_buffer(path->nodes[*level]); 2696 path->nodes[*level] = NULL; 2697 *level = i + 1; 2698 } 2699 } 2700 return 1; 2701 } 2702 2703 /* 2704 * drop the reference count on the tree rooted at 'snap'. This traverses 2705 * the tree freeing any blocks that have a ref count of zero after being 2706 * decremented. 2707 */ 2708 static int walk_log_tree(struct btrfs_trans_handle *trans, 2709 struct btrfs_root *log, struct walk_control *wc) 2710 { 2711 int ret = 0; 2712 int wret; 2713 int level; 2714 struct btrfs_path *path; 2715 int orig_level; 2716 2717 path = btrfs_alloc_path(); 2718 if (!path) 2719 return -ENOMEM; 2720 2721 level = btrfs_header_level(log->node); 2722 orig_level = level; 2723 path->nodes[level] = log->node; 2724 atomic_inc(&log->node->refs); 2725 path->slots[level] = 0; 2726 2727 while (1) { 2728 wret = walk_down_log_tree(trans, log, path, &level, wc); 2729 if (wret > 0) 2730 break; 2731 if (wret < 0) { 2732 ret = wret; 2733 goto out; 2734 } 2735 2736 wret = walk_up_log_tree(trans, log, path, &level, wc); 2737 if (wret > 0) 2738 break; 2739 if (wret < 0) { 2740 ret = wret; 2741 goto out; 2742 } 2743 } 2744 2745 /* was the root node processed? 
if not, catch it here */ 2746 if (path->nodes[orig_level]) { 2747 ret = wc->process_func(log, path->nodes[orig_level], wc, 2748 btrfs_header_generation(path->nodes[orig_level]), 2749 orig_level); 2750 if (ret) 2751 goto out; 2752 if (wc->free) 2753 ret = clean_log_buffer(trans, path->nodes[orig_level]); 2754 } 2755 2756 out: 2757 btrfs_free_path(path); 2758 return ret; 2759 } 2760 2761 /* 2762 * helper function to update the item for a given subvolumes log root 2763 * in the tree of log roots 2764 */ 2765 static int update_log_root(struct btrfs_trans_handle *trans, 2766 struct btrfs_root *log, 2767 struct btrfs_root_item *root_item) 2768 { 2769 struct btrfs_fs_info *fs_info = log->fs_info; 2770 int ret; 2771 2772 if (log->log_transid == 1) { 2773 /* insert root item on the first sync */ 2774 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2775 &log->root_key, root_item); 2776 } else { 2777 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2778 &log->root_key, root_item); 2779 } 2780 return ret; 2781 } 2782 2783 static void wait_log_commit(struct btrfs_root *root, int transid) 2784 { 2785 DEFINE_WAIT(wait); 2786 int index = transid % 2; 2787 2788 /* 2789 * we only allow two pending log transactions at a time, 2790 * so we know that if ours is more than 2 older than the 2791 * current transaction, we're done 2792 */ 2793 for (;;) { 2794 prepare_to_wait(&root->log_commit_wait[index], 2795 &wait, TASK_UNINTERRUPTIBLE); 2796 2797 if (!(root->log_transid_committed < transid && 2798 atomic_read(&root->log_commit[index]))) 2799 break; 2800 2801 mutex_unlock(&root->log_mutex); 2802 schedule(); 2803 mutex_lock(&root->log_mutex); 2804 } 2805 finish_wait(&root->log_commit_wait[index], &wait); 2806 } 2807 2808 static void wait_for_writer(struct btrfs_root *root) 2809 { 2810 DEFINE_WAIT(wait); 2811 2812 for (;;) { 2813 prepare_to_wait(&root->log_writer_wait, &wait, 2814 TASK_UNINTERRUPTIBLE); 2815 if (!atomic_read(&root->log_writers)) 2816 break; 2817 2818 mutex_unlock(&root->log_mutex); 2819 schedule(); 2820 mutex_lock(&root->log_mutex); 2821 } 2822 finish_wait(&root->log_writer_wait, &wait); 2823 } 2824 2825 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct btrfs_inode *inode) 2826 { 2827 ctx->log_ret = 0; 2828 ctx->log_transid = 0; 2829 ctx->log_new_dentries = false; 2830 ctx->logging_new_name = false; 2831 ctx->logging_new_delayed_dentries = false; 2832 ctx->logged_before = false; 2833 ctx->inode = inode; 2834 INIT_LIST_HEAD(&ctx->list); 2835 INIT_LIST_HEAD(&ctx->ordered_extents); 2836 INIT_LIST_HEAD(&ctx->conflict_inodes); 2837 ctx->num_conflict_inodes = 0; 2838 ctx->logging_conflict_inodes = false; 2839 ctx->scratch_eb = NULL; 2840 } 2841 2842 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx) 2843 { 2844 struct btrfs_inode *inode = ctx->inode; 2845 2846 if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && 2847 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 2848 return; 2849 2850 /* 2851 * Don't care about allocation failure. This is just for optimization, 2852 * if we fail to allocate here, we will try again later if needed. 
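 * (If the allocation fails here, clone_leaf() later falls back to
 * btrfs_clone_extent_buffer() and allocates the scratch eb on demand.)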
2853 */ 2854 ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0); 2855 } 2856 2857 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) 2858 { 2859 struct btrfs_ordered_extent *ordered; 2860 struct btrfs_ordered_extent *tmp; 2861 2862 btrfs_assert_inode_locked(ctx->inode); 2863 2864 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 2865 list_del_init(&ordered->log_list); 2866 btrfs_put_ordered_extent(ordered); 2867 } 2868 } 2869 2870 2871 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2872 struct btrfs_log_ctx *ctx) 2873 { 2874 mutex_lock(&root->log_mutex); 2875 list_del_init(&ctx->list); 2876 mutex_unlock(&root->log_mutex); 2877 } 2878 2879 /* 2880 * Invoked in log mutex context, or be sure there is no other task which 2881 * can access the list. 2882 */ 2883 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2884 int index, int error) 2885 { 2886 struct btrfs_log_ctx *ctx; 2887 struct btrfs_log_ctx *safe; 2888 2889 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2890 list_del_init(&ctx->list); 2891 ctx->log_ret = error; 2892 } 2893 } 2894 2895 /* 2896 * Sends a given tree log down to the disk and updates the super blocks to 2897 * record it. When this call is done, you know that any inodes previously 2898 * logged are safely on disk only if it returns 0. 2899 * 2900 * Any other return value means you need to call btrfs_commit_transaction. 2901 * Some of the edge cases for fsyncing directories that have had unlinks 2902 * or renames done in the past mean that sometimes the only safe 2903 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2904 * that has happened. 2905 */ 2906 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2907 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2908 { 2909 int index1; 2910 int index2; 2911 int mark; 2912 int ret; 2913 struct btrfs_fs_info *fs_info = root->fs_info; 2914 struct btrfs_root *log = root->log_root; 2915 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2916 struct btrfs_root_item new_root_item; 2917 int log_transid = 0; 2918 struct btrfs_log_ctx root_log_ctx; 2919 struct blk_plug plug; 2920 u64 log_root_start; 2921 u64 log_root_level; 2922 2923 mutex_lock(&root->log_mutex); 2924 log_transid = ctx->log_transid; 2925 if (root->log_transid_committed >= log_transid) { 2926 mutex_unlock(&root->log_mutex); 2927 return ctx->log_ret; 2928 } 2929 2930 index1 = log_transid % 2; 2931 if (atomic_read(&root->log_commit[index1])) { 2932 wait_log_commit(root, log_transid); 2933 mutex_unlock(&root->log_mutex); 2934 return ctx->log_ret; 2935 } 2936 ASSERT(log_transid == root->log_transid); 2937 atomic_set(&root->log_commit[index1], 1); 2938 2939 /* wait for previous tree log sync to complete */ 2940 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2941 wait_log_commit(root, log_transid - 1); 2942 2943 while (1) { 2944 int batch = atomic_read(&root->log_batch); 2945 /* when we're on an ssd, just kick the log commit out */ 2946 if (!btrfs_test_opt(fs_info, SSD) && 2947 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2948 mutex_unlock(&root->log_mutex); 2949 schedule_timeout_uninterruptible(1); 2950 mutex_lock(&root->log_mutex); 2951 } 2952 wait_for_writer(root); 2953 if (batch == atomic_read(&root->log_batch)) 2954 break; 2955 } 2956 2957 /* bail out if we need to do a full commit */ 2958 if (btrfs_need_log_full_commit(trans)) { 2959 ret = BTRFS_LOG_FORCE_COMMIT; 2960 mutex_unlock(&root->log_mutex); 2961 goto 
out; 2962 } 2963 2964 if (log_transid % 2 == 0) 2965 mark = EXTENT_DIRTY; 2966 else 2967 mark = EXTENT_NEW; 2968 2969 /* we start IO on all the marked extents here, but we don't actually 2970 * wait for them until later. 2971 */ 2972 blk_start_plug(&plug); 2973 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2974 /* 2975 * -EAGAIN happens when someone, e.g., a concurrent transaction 2976 * commit, writes a dirty extent in this tree-log commit. This 2977 * concurrent write will create a hole writing out the extents, 2978 * and we cannot proceed on a zoned filesystem, requiring 2979 * sequential writing. While we can bail out to a full commit 2980 * here, but we can continue hoping the concurrent writing fills 2981 * the hole. 2982 */ 2983 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 2984 ret = 0; 2985 if (ret) { 2986 blk_finish_plug(&plug); 2987 btrfs_set_log_full_commit(trans); 2988 mutex_unlock(&root->log_mutex); 2989 goto out; 2990 } 2991 2992 /* 2993 * We _must_ update under the root->log_mutex in order to make sure we 2994 * have a consistent view of the log root we are trying to commit at 2995 * this moment. 2996 * 2997 * We _must_ copy this into a local copy, because we are not holding the 2998 * log_root_tree->log_mutex yet. This is important because when we 2999 * commit the log_root_tree we must have a consistent view of the 3000 * log_root_tree when we update the super block to point at the 3001 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3002 * with the commit and possibly point at the new block which we may not 3003 * have written out. 3004 */ 3005 btrfs_set_root_node(&log->root_item, log->node); 3006 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3007 3008 btrfs_set_root_log_transid(root, root->log_transid + 1); 3009 log->log_transid = root->log_transid; 3010 root->log_start_pid = 0; 3011 /* 3012 * IO has been started, blocks of the log tree have WRITTEN flag set 3013 * in their headers. new modifications of the log will be written to 3014 * new positions. so it's safe to allow log writers to go in. 3015 */ 3016 mutex_unlock(&root->log_mutex); 3017 3018 if (btrfs_is_zoned(fs_info)) { 3019 mutex_lock(&fs_info->tree_root->log_mutex); 3020 if (!log_root_tree->node) { 3021 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3022 if (ret) { 3023 mutex_unlock(&fs_info->tree_root->log_mutex); 3024 blk_finish_plug(&plug); 3025 goto out; 3026 } 3027 } 3028 mutex_unlock(&fs_info->tree_root->log_mutex); 3029 } 3030 3031 btrfs_init_log_ctx(&root_log_ctx, NULL); 3032 3033 mutex_lock(&log_root_tree->log_mutex); 3034 3035 index2 = log_root_tree->log_transid % 2; 3036 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3037 root_log_ctx.log_transid = log_root_tree->log_transid; 3038 3039 /* 3040 * Now we are safe to update the log_root_tree because we're under the 3041 * log_mutex, and we're a current writer so we're holding the commit 3042 * open until we drop the log_mutex. 
3043 */
3044 ret = update_log_root(trans, log, &new_root_item);
3045 if (ret) {
3046 list_del_init(&root_log_ctx.list);
3047 blk_finish_plug(&plug);
3048 btrfs_set_log_full_commit(trans);
3049 if (ret != -ENOSPC)
3050 btrfs_err(fs_info,
3051 "failed to update log for root %llu ret %d",
3052 btrfs_root_id(root), ret);
3053 btrfs_wait_tree_log_extents(log, mark);
3054 mutex_unlock(&log_root_tree->log_mutex);
3055 goto out;
3056 }
3057
3058 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3059 blk_finish_plug(&plug);
3060 list_del_init(&root_log_ctx.list);
3061 mutex_unlock(&log_root_tree->log_mutex);
3062 ret = root_log_ctx.log_ret;
3063 goto out;
3064 }
3065
3066 if (atomic_read(&log_root_tree->log_commit[index2])) {
3067 blk_finish_plug(&plug);
3068 ret = btrfs_wait_tree_log_extents(log, mark);
3069 wait_log_commit(log_root_tree,
3070 root_log_ctx.log_transid);
3071 mutex_unlock(&log_root_tree->log_mutex);
3072 if (!ret)
3073 ret = root_log_ctx.log_ret;
3074 goto out;
3075 }
3076 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3077 atomic_set(&log_root_tree->log_commit[index2], 1);
3078
3079 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3080 wait_log_commit(log_root_tree,
3081 root_log_ctx.log_transid - 1);
3082 }
3083
3084 /*
3085 * now that we've moved on to the tree of log tree roots,
3086 * check the full commit flag again
3087 */
3088 if (btrfs_need_log_full_commit(trans)) {
3089 blk_finish_plug(&plug);
3090 btrfs_wait_tree_log_extents(log, mark);
3091 mutex_unlock(&log_root_tree->log_mutex);
3092 ret = BTRFS_LOG_FORCE_COMMIT;
3093 goto out_wake_log_root;
3094 }
3095
3096 ret = btrfs_write_marked_extents(fs_info,
3097 &log_root_tree->dirty_log_pages,
3098 EXTENT_DIRTY | EXTENT_NEW);
3099 blk_finish_plug(&plug);
3100 /*
3101 * As described above, -EAGAIN indicates a hole in the extents. We
3102 * cannot wait for these write outs since the waiting causes a
3103 * deadlock. Bail out to the full commit instead.
3104 */ 3105 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3106 btrfs_set_log_full_commit(trans); 3107 btrfs_wait_tree_log_extents(log, mark); 3108 mutex_unlock(&log_root_tree->log_mutex); 3109 goto out_wake_log_root; 3110 } else if (ret) { 3111 btrfs_set_log_full_commit(trans); 3112 mutex_unlock(&log_root_tree->log_mutex); 3113 goto out_wake_log_root; 3114 } 3115 ret = btrfs_wait_tree_log_extents(log, mark); 3116 if (!ret) 3117 ret = btrfs_wait_tree_log_extents(log_root_tree, 3118 EXTENT_NEW | EXTENT_DIRTY); 3119 if (ret) { 3120 btrfs_set_log_full_commit(trans); 3121 mutex_unlock(&log_root_tree->log_mutex); 3122 goto out_wake_log_root; 3123 } 3124 3125 log_root_start = log_root_tree->node->start; 3126 log_root_level = btrfs_header_level(log_root_tree->node); 3127 log_root_tree->log_transid++; 3128 mutex_unlock(&log_root_tree->log_mutex); 3129 3130 /* 3131 * Here we are guaranteed that nobody is going to write the superblock 3132 * for the current transaction before us and that neither we do write 3133 * our superblock before the previous transaction finishes its commit 3134 * and writes its superblock, because: 3135 * 3136 * 1) We are holding a handle on the current transaction, so no body 3137 * can commit it until we release the handle; 3138 * 3139 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3140 * if the previous transaction is still committing, and hasn't yet 3141 * written its superblock, we wait for it to do it, because a 3142 * transaction commit acquires the tree_log_mutex when the commit 3143 * begins and releases it only after writing its superblock. 3144 */ 3145 mutex_lock(&fs_info->tree_log_mutex); 3146 3147 /* 3148 * The previous transaction writeout phase could have failed, and thus 3149 * marked the fs in an error state. We must not commit here, as we 3150 * could have updated our generation in the super_for_commit and 3151 * writing the super here would result in transid mismatches. If there 3152 * is an error here just bail. 3153 */ 3154 if (BTRFS_FS_ERROR(fs_info)) { 3155 ret = -EIO; 3156 btrfs_set_log_full_commit(trans); 3157 btrfs_abort_transaction(trans, ret); 3158 mutex_unlock(&fs_info->tree_log_mutex); 3159 goto out_wake_log_root; 3160 } 3161 3162 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3163 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3164 ret = write_all_supers(fs_info, 1); 3165 mutex_unlock(&fs_info->tree_log_mutex); 3166 if (ret) { 3167 btrfs_set_log_full_commit(trans); 3168 btrfs_abort_transaction(trans, ret); 3169 goto out_wake_log_root; 3170 } 3171 3172 /* 3173 * We know there can only be one task here, since we have not yet set 3174 * root->log_commit[index1] to 0 and any task attempting to sync the 3175 * log must wait for the previous log transaction to commit if it's 3176 * still in progress or wait for the current log transaction commit if 3177 * someone else already started it. We use <= and not < because the 3178 * first log transaction has an ID of 0. 
3179 */ 3180 ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); 3181 btrfs_set_root_last_log_commit(root, log_transid); 3182 3183 out_wake_log_root: 3184 mutex_lock(&log_root_tree->log_mutex); 3185 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3186 3187 log_root_tree->log_transid_committed++; 3188 atomic_set(&log_root_tree->log_commit[index2], 0); 3189 mutex_unlock(&log_root_tree->log_mutex); 3190 3191 /* 3192 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3193 * all the updates above are seen by the woken threads. It might not be 3194 * necessary, but proving that seems to be hard. 3195 */ 3196 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3197 out: 3198 mutex_lock(&root->log_mutex); 3199 btrfs_remove_all_log_ctxs(root, index1, ret); 3200 root->log_transid_committed++; 3201 atomic_set(&root->log_commit[index1], 0); 3202 mutex_unlock(&root->log_mutex); 3203 3204 /* 3205 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3206 * all the updates above are seen by the woken threads. It might not be 3207 * necessary, but proving that seems to be hard. 3208 */ 3209 cond_wake_up(&root->log_commit_wait[index1]); 3210 return ret; 3211 } 3212 3213 static void free_log_tree(struct btrfs_trans_handle *trans, 3214 struct btrfs_root *log) 3215 { 3216 int ret; 3217 struct walk_control wc = { 3218 .free = 1, 3219 .process_func = process_one_buffer 3220 }; 3221 3222 if (log->node) { 3223 ret = walk_log_tree(trans, log, &wc); 3224 if (ret) { 3225 /* 3226 * We weren't able to traverse the entire log tree, the 3227 * typical scenario is getting an -EIO when reading an 3228 * extent buffer of the tree, due to a previous writeback 3229 * failure of it. 3230 */ 3231 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3232 &log->fs_info->fs_state); 3233 3234 /* 3235 * Some extent buffers of the log tree may still be dirty 3236 * and not yet written back to storage, because we may 3237 * have updates to a log tree without syncing a log tree, 3238 * such as during rename and link operations. So flush 3239 * them out and wait for their writeback to complete, so 3240 * that we properly cleanup their state and pages. 3241 */ 3242 btrfs_write_marked_extents(log->fs_info, 3243 &log->dirty_log_pages, 3244 EXTENT_DIRTY | EXTENT_NEW); 3245 btrfs_wait_tree_log_extents(log, 3246 EXTENT_DIRTY | EXTENT_NEW); 3247 3248 if (trans) 3249 btrfs_abort_transaction(trans, ret); 3250 else 3251 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3252 } 3253 } 3254 3255 btrfs_extent_io_tree_release(&log->dirty_log_pages); 3256 btrfs_extent_io_tree_release(&log->log_csum_range); 3257 3258 btrfs_put_root(log); 3259 } 3260 3261 /* 3262 * free all the extents used by the tree log. This should be called 3263 * at commit time of the full transaction 3264 */ 3265 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3266 { 3267 if (root->log_root) { 3268 free_log_tree(trans, root->log_root); 3269 root->log_root = NULL; 3270 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3271 } 3272 return 0; 3273 } 3274 3275 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3276 struct btrfs_fs_info *fs_info) 3277 { 3278 if (fs_info->log_root_tree) { 3279 free_log_tree(trans, fs_info->log_root_tree); 3280 fs_info->log_root_tree = NULL; 3281 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3282 } 3283 return 0; 3284 } 3285 3286 /* 3287 * Check if an inode was logged in the current transaction. 
This correctly deals
3288 * with the case where the inode was logged but has a logged_trans of 0, which
3289 * happens if the inode is evicted and loaded again, as logged_trans is an in
3290 * memory only field (not persisted).
3291 *
3292 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3293 * and < 0 on error.
3294 */
3295 static int inode_logged(const struct btrfs_trans_handle *trans,
3296 struct btrfs_inode *inode,
3297 struct btrfs_path *path_in)
3298 {
3299 struct btrfs_path *path = path_in;
3300 struct btrfs_key key;
3301 int ret;
3302
3303 if (inode->logged_trans == trans->transid)
3304 return 1;
3305
3306 /*
3307 * If logged_trans is not 0, then we know the inode was not logged
3308 * in this transaction, so we can return false right away.
3309 */
3310 if (inode->logged_trans > 0)
3311 return 0;
3312
3313 /*
3314 * If no log tree was created for this root in this transaction, then
3315 * the inode cannot have been logged in this transaction. In that case
3316 * set logged_trans to anything greater than 0 and less than the current
3317 * transaction's ID, to avoid the search below in a future call in case
3318 * a log tree gets created after this.
3319 */
3320 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3321 inode->logged_trans = trans->transid - 1;
3322 return 0;
3323 }
3324
3325 /*
3326 * We have a log tree and the inode's logged_trans is 0. We can't tell
3327 * for sure if the inode was logged before in this transaction by looking
3328 * only at logged_trans. We could be pessimistic and assume it was, but
3329 * that can lead to unnecessarily logging an inode during rename and link
3330 * operations, and then further updating the log in followup rename and
3331 * link operations, especially if it's a directory, which adds latency
3332 * visible to applications doing a series of rename or link operations.
3333 *
3334 * A logged_trans of 0 here can mean several things:
3335 *
3336 * 1) The inode was never logged since the filesystem was mounted, and may
3337 * or may not have been evicted and loaded again;
3338 *
3339 * 2) The inode was logged in a previous transaction, then evicted and
3340 * then loaded again;
3341 *
3342 * 3) The inode was logged in the current transaction, then evicted and
3343 * then loaded again.
3344 *
3345 * For cases 1) and 2) we don't want to return true, but we need to detect
3346 * case 3) and return true. So we do a search in the log root for the inode
3347 * item.
3348 */
3349 key.objectid = btrfs_ino(inode);
3350 key.type = BTRFS_INODE_ITEM_KEY;
3351 key.offset = 0;
3352
3353 if (!path) {
3354 path = btrfs_alloc_path();
3355 if (!path)
3356 return -ENOMEM;
3357 }
3358
3359 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3360
3361 if (path_in)
3362 btrfs_release_path(path);
3363 else
3364 btrfs_free_path(path);
3365
3366 /*
3367 * Logging an inode always results in logging its inode item. So if we
3368 * did not find the item we know the inode was not logged for sure.
3369 */
3370 if (ret < 0) {
3371 return ret;
3372 } else if (ret > 0) {
3373 /*
3374 * Set logged_trans to a value greater than 0 and less than the
3375 * current transaction to avoid doing the search in future calls.
3376 */
3377 inode->logged_trans = trans->transid - 1;
3378 return 0;
3379 }
3380
3381 /*
3382 * The inode was previously logged and then evicted; set logged_trans to
3383 * the current transaction's ID, to avoid future tree searches as long as
3384 * the inode is not evicted again.
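 *
 * A hypothetical sequence that reaches this point: an fsync logs the
 * inode, memory pressure evicts it, and a later link or rename in the
 * same transaction loads it again with logged_trans == 0; the search
 * above then finds its inode item in the log root.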
3385 */
3386 inode->logged_trans = trans->transid;
3387
3388 /*
3389 * If it's a directory, then we must set last_dir_index_offset to the
3390 * maximum possible value, so that the next attempt to log the inode does
3391 * not skip checking if dir index keys found in modified subvolume tree
3392 * leaves have been logged before, otherwise it would result in attempts
3393 * to insert duplicate dir index keys in the log tree. This must be done
3394 * because last_dir_index_offset is an in-memory only field, not persisted
3395 * in the inode item or any other on-disk structure, so its value is lost
3396 * once the inode is evicted.
3397 */
3398 if (S_ISDIR(inode->vfs_inode.i_mode))
3399 inode->last_dir_index_offset = (u64)-1;
3400
3401 return 1;
3402 }
3403
3404 /*
3405 * Delete a directory entry from the log if it exists.
3406 *
3407 * Returns < 0 on error
3408 * 1 if the entry does not exist
3409 * 0 if the entry existed and was successfully deleted
3410 */
3411 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3412 struct btrfs_root *log,
3413 struct btrfs_path *path,
3414 u64 dir_ino,
3415 const struct fscrypt_str *name,
3416 u64 index)
3417 {
3418 struct btrfs_dir_item *di;
3419
3420 /*
3421 * We only log dir index items of a directory, so we don't need to look
3422 * for dir item keys.
3423 */
3424 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3425 index, name, -1);
3426 if (IS_ERR(di))
3427 return PTR_ERR(di);
3428 else if (!di)
3429 return 1;
3430
3431 /*
3432 * We do not need to update the size field of the directory's
3433 * inode item because on log replay we update the field to reflect
3434 * all existing entries in the directory (see overwrite_item()).
3435 */
3436 return btrfs_delete_one_dir_name(trans, log, path, di);
3437 }
3438
3439 /*
3440 * If both a file and directory are logged, and unlinks or renames are
3441 * mixed in, we have a few interesting corners:
3442 *
3443 * create file X in dir Y
3444 * link file X to X.link in dir Y
3445 * fsync file X
3446 * unlink file X but leave X.link
3447 * fsync dir Y
3448 *
3449 * After a crash we would expect only X.link to exist. But file X
3450 * didn't get fsync'd again so the log has back refs for X and X.link.
3451 *
3452 * We solve this by removing directory entries and inode backrefs from the
3453 * log when a file that was logged in the current transaction is
3454 * unlinked. Any later fsync will include the updated log entries, and
3455 * we'll be able to reconstruct the proper directory items from backrefs.
3456 *
3457 * This optimization allows us to avoid relogging the entire inode
3458 * or the entire directory.
3459 */ 3460 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3461 struct btrfs_root *root, 3462 const struct fscrypt_str *name, 3463 struct btrfs_inode *dir, u64 index) 3464 { 3465 struct btrfs_path *path; 3466 int ret; 3467 3468 ret = inode_logged(trans, dir, NULL); 3469 if (ret == 0) 3470 return; 3471 else if (ret < 0) { 3472 btrfs_set_log_full_commit(trans); 3473 return; 3474 } 3475 3476 ret = join_running_log_trans(root); 3477 if (ret) 3478 return; 3479 3480 mutex_lock(&dir->log_mutex); 3481 3482 path = btrfs_alloc_path(); 3483 if (!path) { 3484 ret = -ENOMEM; 3485 goto out_unlock; 3486 } 3487 3488 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3489 name, index); 3490 btrfs_free_path(path); 3491 out_unlock: 3492 mutex_unlock(&dir->log_mutex); 3493 if (ret < 0) 3494 btrfs_set_log_full_commit(trans); 3495 btrfs_end_log_trans(root); 3496 } 3497 3498 /* see comments for btrfs_del_dir_entries_in_log */ 3499 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3500 struct btrfs_root *root, 3501 const struct fscrypt_str *name, 3502 struct btrfs_inode *inode, u64 dirid) 3503 { 3504 struct btrfs_root *log; 3505 u64 index; 3506 int ret; 3507 3508 ret = inode_logged(trans, inode, NULL); 3509 if (ret == 0) 3510 return; 3511 else if (ret < 0) { 3512 btrfs_set_log_full_commit(trans); 3513 return; 3514 } 3515 3516 ret = join_running_log_trans(root); 3517 if (ret) 3518 return; 3519 log = root->log_root; 3520 mutex_lock(&inode->log_mutex); 3521 3522 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), 3523 dirid, &index); 3524 mutex_unlock(&inode->log_mutex); 3525 if (ret < 0 && ret != -ENOENT) 3526 btrfs_set_log_full_commit(trans); 3527 btrfs_end_log_trans(root); 3528 } 3529 3530 /* 3531 * creates a range item in the log for 'dirid'. first_offset and 3532 * last_offset tell us which parts of the key space the log should 3533 * be considered authoritative for. 3534 */ 3535 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3536 struct btrfs_root *log, 3537 struct btrfs_path *path, 3538 u64 dirid, 3539 u64 first_offset, u64 last_offset) 3540 { 3541 int ret; 3542 struct btrfs_key key; 3543 struct btrfs_dir_log_item *item; 3544 3545 key.objectid = dirid; 3546 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3547 key.offset = first_offset; 3548 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3549 /* 3550 * -EEXIST is fine and can happen sporadically when we are logging a 3551 * directory and have concurrent insertions in the subvolume's tree for 3552 * items from other inodes and that result in pushing off some dir items 3553 * from one leaf to another in order to accommodate for the new items. 3554 * This results in logging the same dir index range key. 3555 */ 3556 if (ret && ret != -EEXIST) 3557 return ret; 3558 3559 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3560 struct btrfs_dir_log_item); 3561 if (ret == -EEXIST) { 3562 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3563 3564 /* 3565 * btrfs_del_dir_entries_in_log() might have been called during 3566 * an unlink between the initial insertion of this key and the 3567 * current update, or we might be logging a single entry deletion 3568 * during a rename, so set the new last_offset to the max value. 
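 * Taking the maximum means the authoritative range recorded for the
 * directory can only grow here, never shrink.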
3569 */ 3570 last_offset = max(last_offset, curr_end); 3571 } 3572 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3573 btrfs_release_path(path); 3574 return 0; 3575 } 3576 3577 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3578 struct btrfs_inode *inode, 3579 struct extent_buffer *src, 3580 struct btrfs_path *dst_path, 3581 int start_slot, 3582 int count) 3583 { 3584 struct btrfs_root *log = inode->root->log_root; 3585 char *ins_data = NULL; 3586 struct btrfs_item_batch batch; 3587 struct extent_buffer *dst; 3588 unsigned long src_offset; 3589 unsigned long dst_offset; 3590 u64 last_index; 3591 struct btrfs_key key; 3592 u32 item_size; 3593 int ret; 3594 int i; 3595 3596 ASSERT(count > 0); 3597 batch.nr = count; 3598 3599 if (count == 1) { 3600 btrfs_item_key_to_cpu(src, &key, start_slot); 3601 item_size = btrfs_item_size(src, start_slot); 3602 batch.keys = &key; 3603 batch.data_sizes = &item_size; 3604 batch.total_data_size = item_size; 3605 } else { 3606 struct btrfs_key *ins_keys; 3607 u32 *ins_sizes; 3608 3609 ins_data = kmalloc(count * sizeof(u32) + 3610 count * sizeof(struct btrfs_key), GFP_NOFS); 3611 if (!ins_data) 3612 return -ENOMEM; 3613 3614 ins_sizes = (u32 *)ins_data; 3615 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3616 batch.keys = ins_keys; 3617 batch.data_sizes = ins_sizes; 3618 batch.total_data_size = 0; 3619 3620 for (i = 0; i < count; i++) { 3621 const int slot = start_slot + i; 3622 3623 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3624 ins_sizes[i] = btrfs_item_size(src, slot); 3625 batch.total_data_size += ins_sizes[i]; 3626 } 3627 } 3628 3629 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3630 if (ret) 3631 goto out; 3632 3633 dst = dst_path->nodes[0]; 3634 /* 3635 * Copy all the items in bulk, in a single copy operation. Item data is 3636 * organized such that it's placed at the end of a leaf and from right 3637 * to left. For example, the data for the second item ends at an offset 3638 * that matches the offset where the data for the first item starts, the 3639 * data for the third item ends at an offset that matches the offset 3640 * where the data of the second items starts, and so on. 3641 * Therefore our source and destination start offsets for copy match the 3642 * offsets of the last items (highest slots). 3643 */ 3644 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3645 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3646 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3647 btrfs_release_path(dst_path); 3648 3649 last_index = batch.keys[count - 1].offset; 3650 ASSERT(last_index > inode->last_dir_index_offset); 3651 3652 /* 3653 * If for some unexpected reason the last item's index is not greater 3654 * than the last index we logged, warn and force a transaction commit. 
3655 */ 3656 if (WARN_ON(last_index <= inode->last_dir_index_offset)) 3657 ret = BTRFS_LOG_FORCE_COMMIT; 3658 else 3659 inode->last_dir_index_offset = last_index; 3660 3661 if (btrfs_get_first_dir_index_to_log(inode) == 0) 3662 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); 3663 out: 3664 kfree(ins_data); 3665 3666 return ret; 3667 } 3668 3669 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx) 3670 { 3671 const int slot = path->slots[0]; 3672 3673 if (ctx->scratch_eb) { 3674 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]); 3675 } else { 3676 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]); 3677 if (!ctx->scratch_eb) 3678 return -ENOMEM; 3679 } 3680 3681 btrfs_release_path(path); 3682 path->nodes[0] = ctx->scratch_eb; 3683 path->slots[0] = slot; 3684 /* 3685 * Add extra ref to scratch eb so that it is not freed when callers 3686 * release the path, so we can reuse it later if needed. 3687 */ 3688 atomic_inc(&ctx->scratch_eb->refs); 3689 3690 return 0; 3691 } 3692 3693 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3694 struct btrfs_inode *inode, 3695 struct btrfs_path *path, 3696 struct btrfs_path *dst_path, 3697 struct btrfs_log_ctx *ctx, 3698 u64 *last_old_dentry_offset) 3699 { 3700 struct btrfs_root *log = inode->root->log_root; 3701 struct extent_buffer *src; 3702 const int nritems = btrfs_header_nritems(path->nodes[0]); 3703 const u64 ino = btrfs_ino(inode); 3704 bool last_found = false; 3705 int batch_start = 0; 3706 int batch_size = 0; 3707 int ret; 3708 3709 /* 3710 * We need to clone the leaf, release the read lock on it, and use the 3711 * clone before modifying the log tree. See the comment at copy_items() 3712 * about why we need to do this. 3713 */ 3714 ret = clone_leaf(path, ctx); 3715 if (ret < 0) 3716 return ret; 3717 3718 src = path->nodes[0]; 3719 3720 for (int i = path->slots[0]; i < nritems; i++) { 3721 struct btrfs_dir_item *di; 3722 struct btrfs_key key; 3723 int ret; 3724 3725 btrfs_item_key_to_cpu(src, &key, i); 3726 3727 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3728 last_found = true; 3729 break; 3730 } 3731 3732 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3733 3734 /* 3735 * Skip ranges of items that consist only of dir item keys created 3736 * in past transactions. However if we find a gap, we must log a 3737 * dir index range item for that gap, so that index keys in that 3738 * gap are deleted during log replay. 3739 */ 3740 if (btrfs_dir_transid(src, di) < trans->transid) { 3741 if (key.offset > *last_old_dentry_offset + 1) { 3742 ret = insert_dir_log_key(trans, log, dst_path, 3743 ino, *last_old_dentry_offset + 1, 3744 key.offset - 1); 3745 if (ret < 0) 3746 return ret; 3747 } 3748 3749 *last_old_dentry_offset = key.offset; 3750 continue; 3751 } 3752 3753 /* If we logged this dir index item before, we can skip it. */ 3754 if (key.offset <= inode->last_dir_index_offset) 3755 continue; 3756 3757 /* 3758 * We must make sure that when we log a directory entry, the 3759 * corresponding inode, after log replay, has a matching link 3760 * count. For example: 3761 * 3762 * touch foo 3763 * mkdir mydir 3764 * sync 3765 * ln foo mydir/bar 3766 * xfs_io -c "fsync" mydir 3767 * <crash> 3768 * <mount fs and log replay> 3769 * 3770 * Would result in a fsync log that when replayed, our file inode 3771 * would have a link count of 1, but we get two directory entries 3772 * pointing to the same inode. 
After removing one of the names, 3773 * it would not be possible to remove the other name, which 3774 * resulted always in stale file handle errors, and would not be 3775 * possible to rmdir the parent directory, since its i_size could 3776 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3777 * resulting in -ENOTEMPTY errors. 3778 */ 3779 if (!ctx->log_new_dentries) { 3780 struct btrfs_key di_key; 3781 3782 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3783 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3784 ctx->log_new_dentries = true; 3785 } 3786 3787 if (batch_size == 0) 3788 batch_start = i; 3789 batch_size++; 3790 } 3791 3792 if (batch_size > 0) { 3793 int ret; 3794 3795 ret = flush_dir_items_batch(trans, inode, src, dst_path, 3796 batch_start, batch_size); 3797 if (ret < 0) 3798 return ret; 3799 } 3800 3801 return last_found ? 1 : 0; 3802 } 3803 3804 /* 3805 * log all the items included in the current transaction for a given 3806 * directory. This also creates the range items in the log tree required 3807 * to replay anything deleted before the fsync 3808 */ 3809 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3810 struct btrfs_inode *inode, 3811 struct btrfs_path *path, 3812 struct btrfs_path *dst_path, 3813 struct btrfs_log_ctx *ctx, 3814 u64 min_offset, u64 *last_offset_ret) 3815 { 3816 struct btrfs_key min_key; 3817 struct btrfs_root *root = inode->root; 3818 struct btrfs_root *log = root->log_root; 3819 int ret; 3820 u64 last_old_dentry_offset = min_offset - 1; 3821 u64 last_offset = (u64)-1; 3822 u64 ino = btrfs_ino(inode); 3823 3824 min_key.objectid = ino; 3825 min_key.type = BTRFS_DIR_INDEX_KEY; 3826 min_key.offset = min_offset; 3827 3828 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3829 3830 /* 3831 * we didn't find anything from this transaction, see if there 3832 * is anything at all 3833 */ 3834 if (ret != 0 || min_key.objectid != ino || 3835 min_key.type != BTRFS_DIR_INDEX_KEY) { 3836 min_key.objectid = ino; 3837 min_key.type = BTRFS_DIR_INDEX_KEY; 3838 min_key.offset = (u64)-1; 3839 btrfs_release_path(path); 3840 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3841 if (ret < 0) { 3842 btrfs_release_path(path); 3843 return ret; 3844 } 3845 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3846 3847 /* if ret == 0 there are items for this type, 3848 * create a range to tell us the last key of this type. 3849 * otherwise, there are no items in this directory after 3850 * *min_offset, and we create a range to indicate that. 3851 */ 3852 if (ret == 0) { 3853 struct btrfs_key tmp; 3854 3855 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3856 path->slots[0]); 3857 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3858 last_old_dentry_offset = tmp.offset; 3859 } else if (ret > 0) { 3860 ret = 0; 3861 } 3862 3863 goto done; 3864 } 3865 3866 /* go backward to find any previous key */ 3867 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3868 if (ret == 0) { 3869 struct btrfs_key tmp; 3870 3871 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3872 /* 3873 * The dir index key before the first one we found that needs to 3874 * be logged might be in a previous leaf, and there might be a 3875 * gap between these keys, meaning that we had deletions that 3876 * happened. So the key range item we log (key type 3877 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 3878 * previous key's offset plus 1, so that those deletes are replayed. 
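* For example, if the previous dir index key has offset 10 and the first index key we are going to log has offset 15, then index keys 11 to 14 no longer exist, and the range item we log must start at offset 11 so that log replay deletes any such entries.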
3879 */ 3880 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3881 last_old_dentry_offset = tmp.offset; 3882 } else if (ret < 0) { 3883 goto done; 3884 } 3885 3886 btrfs_release_path(path); 3887 3888 /* 3889 * Find the first key from this transaction again or the one we were at 3890 * in the loop below in case we had to reschedule. We may be logging the 3891 * directory without holding its VFS lock, which happens when logging new 3892 * dentries (through log_new_dir_dentries()) or in some cases when we 3893 * need to log the parent directory of an inode. This means a dir index 3894 * key might be deleted from the inode's root, and therefore we may not 3895 * find it anymore. If we can't find it, just move to the next key. We 3896 * cannot bail out and ignore, because if we do that we will simply 3897 * not log dir index keys that come after the one that was just deleted 3898 * and we can end up logging a dir index range that ends at (u64)-1 3899 * (@last_offset is initialized to that), resulting in removing dir 3900 * entries we should not remove at log replay time. 3901 */ 3902 search: 3903 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3904 if (ret > 0) { 3905 ret = btrfs_next_item(root, path); 3906 if (ret > 0) { 3907 /* There are no more keys in the inode's root. */ 3908 ret = 0; 3909 goto done; 3910 } 3911 } 3912 if (ret < 0) 3913 goto done; 3914 3915 /* 3916 * we have a block from this transaction, log every item in it 3917 * from our directory 3918 */ 3919 while (1) { 3920 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 3921 &last_old_dentry_offset); 3922 if (ret != 0) { 3923 if (ret > 0) 3924 ret = 0; 3925 goto done; 3926 } 3927 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 3928 3929 /* 3930 * look ahead to the next item and see if it is also 3931 * from this directory and from this transaction 3932 */ 3933 ret = btrfs_next_leaf(root, path); 3934 if (ret) { 3935 if (ret == 1) { 3936 last_offset = (u64)-1; 3937 ret = 0; 3938 } 3939 goto done; 3940 } 3941 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 3942 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 3943 last_offset = (u64)-1; 3944 goto done; 3945 } 3946 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3947 /* 3948 * The next leaf was not changed in the current transaction 3949 * and has at least one dir index key. 3950 * We check for the next key because there might have been 3951 * one or more deletions between the last key we logged and 3952 * that next key. So the key range item we log (key type 3953 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 3954 * offset minus 1, so that those deletes are replayed. 3955 */ 3956 last_offset = min_key.offset - 1; 3957 goto done; 3958 } 3959 if (need_resched()) { 3960 btrfs_release_path(path); 3961 cond_resched(); 3962 goto search; 3963 } 3964 } 3965 done: 3966 btrfs_release_path(path); 3967 btrfs_release_path(dst_path); 3968 3969 if (ret == 0) { 3970 *last_offset_ret = last_offset; 3971 /* 3972 * In case the leaf was changed in the current transaction but 3973 * all its dir items are from a past transaction, the last item 3974 * in the leaf is a dir item and there's no gap between that last 3975 * dir item and the first one on the next leaf (which did not 3976 * change in the current transaction), then we don't need to log 3977 * a range, as last_old_dentry_offset is equal to last_offset.
3978 */ 3979 ASSERT(last_old_dentry_offset <= last_offset); 3980 if (last_old_dentry_offset < last_offset) 3981 ret = insert_dir_log_key(trans, log, path, ino, 3982 last_old_dentry_offset + 1, 3983 last_offset); 3984 } 3985 3986 return ret; 3987 } 3988 3989 /* 3990 * If the inode was logged before and it was evicted, then its 3991 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index 3992 * key offset. If that's the case, search for it and update the inode. This 3993 * is to avoid lookups in the log tree every time we try to insert a dir index 3994 * key from a leaf changed in the current transaction, and to allow us to always 3995 * do batch insertions of dir index keys. 3996 */ 3997 static int update_last_dir_index_offset(struct btrfs_inode *inode, 3998 struct btrfs_path *path, 3999 const struct btrfs_log_ctx *ctx) 4000 { 4001 const u64 ino = btrfs_ino(inode); 4002 struct btrfs_key key; 4003 int ret; 4004 4005 lockdep_assert_held(&inode->log_mutex); 4006 4007 if (inode->last_dir_index_offset != (u64)-1) 4008 return 0; 4009 4010 if (!ctx->logged_before) { 4011 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 4012 return 0; 4013 } 4014 4015 key.objectid = ino; 4016 key.type = BTRFS_DIR_INDEX_KEY; 4017 key.offset = (u64)-1; 4018 4019 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 4020 /* 4021 * An error happened or we actually have an index key with an offset 4022 * value of (u64)-1. Bail out, we're done. 4023 */ 4024 if (ret <= 0) 4025 goto out; 4026 4027 ret = 0; 4028 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 4029 4030 /* 4031 * No dir index items, bail out and leave last_dir_index_offset with 4032 * the value right before the first valid index value. 4033 */ 4034 if (path->slots[0] == 0) 4035 goto out; 4036 4037 /* 4038 * btrfs_search_slot() left us at one slot beyond the slot with the last 4039 * index key, or beyond the last key of the directory that is not an 4040 * index key. If we have an index key before, set last_dir_index_offset 4041 * to its offset value, otherwise leave it with a value right before the 4042 * first valid index value, as it means we have an empty directory. 4043 */ 4044 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 4045 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) 4046 inode->last_dir_index_offset = key.offset; 4047 4048 out: 4049 btrfs_release_path(path); 4050 4051 return ret; 4052 } 4053 4054 /* 4055 * Logging directories is very similar to logging inodes. We find all the items 4056 * from the current transaction and write them to the log. 4057 * 4058 * The recovery code scans the directory in the subvolume, and if it finds a 4059 * key in the range logged that is not present in the log tree, then it means 4060 * that dir entry was unlinked during the transaction. 4061 * 4062 * In order for that scan to work, we must include one key smaller than 4063 * the smallest logged by this transaction and one key larger than the largest 4064 * key logged by this transaction.
4065 */ 4066 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4067 struct btrfs_inode *inode, 4068 struct btrfs_path *path, 4069 struct btrfs_path *dst_path, 4070 struct btrfs_log_ctx *ctx) 4071 { 4072 u64 min_key; 4073 u64 max_key; 4074 int ret; 4075 4076 ret = update_last_dir_index_offset(inode, path, ctx); 4077 if (ret) 4078 return ret; 4079 4080 min_key = BTRFS_DIR_START_INDEX; 4081 max_key = 0; 4082 4083 while (1) { 4084 ret = log_dir_items(trans, inode, path, dst_path, 4085 ctx, min_key, &max_key); 4086 if (ret) 4087 return ret; 4088 if (max_key == (u64)-1) 4089 break; 4090 min_key = max_key + 1; 4091 } 4092 4093 return 0; 4094 } 4095 4096 /* 4097 * a helper function to drop items from the log before we relog an 4098 * inode. max_key_type indicates the highest item type to remove. 4099 * This cannot be run for file data extents because it does not 4100 * free the extents they point to. 4101 */ 4102 static int drop_inode_items(struct btrfs_trans_handle *trans, 4103 struct btrfs_root *log, 4104 struct btrfs_path *path, 4105 struct btrfs_inode *inode, 4106 int max_key_type) 4107 { 4108 int ret; 4109 struct btrfs_key key; 4110 struct btrfs_key found_key; 4111 int start_slot; 4112 4113 key.objectid = btrfs_ino(inode); 4114 key.type = max_key_type; 4115 key.offset = (u64)-1; 4116 4117 while (1) { 4118 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4119 if (ret < 0) { 4120 break; 4121 } else if (ret > 0) { 4122 if (path->slots[0] == 0) 4123 break; 4124 path->slots[0]--; 4125 } 4126 4127 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4128 path->slots[0]); 4129 4130 if (found_key.objectid != key.objectid) 4131 break; 4132 4133 found_key.offset = 0; 4134 found_key.type = 0; 4135 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); 4136 if (ret < 0) 4137 break; 4138 4139 ret = btrfs_del_items(trans, log, path, start_slot, 4140 path->slots[0] - start_slot + 1); 4141 /* 4142 * If start slot isn't 0 then we don't need to re-search, we've 4143 * found the last guy with the objectid in this tree. 
4144 */ 4145 if (ret || start_slot != 0) 4146 break; 4147 btrfs_release_path(path); 4148 } 4149 btrfs_release_path(path); 4150 if (ret > 0) 4151 ret = 0; 4152 return ret; 4153 } 4154 4155 static int truncate_inode_items(struct btrfs_trans_handle *trans, 4156 struct btrfs_root *log_root, 4157 struct btrfs_inode *inode, 4158 u64 new_size, u32 min_type) 4159 { 4160 struct btrfs_truncate_control control = { 4161 .new_size = new_size, 4162 .ino = btrfs_ino(inode), 4163 .min_type = min_type, 4164 .skip_ref_updates = true, 4165 }; 4166 4167 return btrfs_truncate_inode_items(trans, log_root, &control); 4168 } 4169 4170 static void fill_inode_item(struct btrfs_trans_handle *trans, 4171 struct extent_buffer *leaf, 4172 struct btrfs_inode_item *item, 4173 struct inode *inode, int log_inode_only, 4174 u64 logged_isize) 4175 { 4176 struct btrfs_map_token token; 4177 u64 flags; 4178 4179 btrfs_init_map_token(&token, leaf); 4180 4181 if (log_inode_only) { 4182 /* set the generation to zero so the recovery code 4183 * can tell the difference between a logging 4184 * just to say 'this inode exists' and a logging 4185 * to say 'update this inode with these values' 4186 */ 4187 btrfs_set_token_inode_generation(&token, item, 0); 4188 btrfs_set_token_inode_size(&token, item, logged_isize); 4189 } else { 4190 btrfs_set_token_inode_generation(&token, item, 4191 BTRFS_I(inode)->generation); 4192 btrfs_set_token_inode_size(&token, item, inode->i_size); 4193 } 4194 4195 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 4196 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 4197 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 4198 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 4199 4200 btrfs_set_token_timespec_sec(&token, &item->atime, 4201 inode_get_atime_sec(inode)); 4202 btrfs_set_token_timespec_nsec(&token, &item->atime, 4203 inode_get_atime_nsec(inode)); 4204 4205 btrfs_set_token_timespec_sec(&token, &item->mtime, 4206 inode_get_mtime_sec(inode)); 4207 btrfs_set_token_timespec_nsec(&token, &item->mtime, 4208 inode_get_mtime_nsec(inode)); 4209 4210 btrfs_set_token_timespec_sec(&token, &item->ctime, 4211 inode_get_ctime_sec(inode)); 4212 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4213 inode_get_ctime_nsec(inode)); 4214 4215 /* 4216 * We do not need to set the nbytes field, in fact during a fast fsync 4217 * its value may not even be correct, since a fast fsync does not wait 4218 * for ordered extent completion, which is where we update nbytes; it 4219 * only waits for writeback to complete. During log replay as we find 4220 * file extent items and replay them, we adjust the nbytes field of the 4221 * inode item in subvolume tree as needed (see overwrite_item()).
4222 */ 4223 4224 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4225 btrfs_set_token_inode_transid(&token, item, trans->transid); 4226 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4227 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4228 BTRFS_I(inode)->ro_flags); 4229 btrfs_set_token_inode_flags(&token, item, flags); 4230 btrfs_set_token_inode_block_group(&token, item, 0); 4231 } 4232 4233 static int log_inode_item(struct btrfs_trans_handle *trans, 4234 struct btrfs_root *log, struct btrfs_path *path, 4235 struct btrfs_inode *inode, bool inode_item_dropped) 4236 { 4237 struct btrfs_inode_item *inode_item; 4238 struct btrfs_key key; 4239 int ret; 4240 4241 btrfs_get_inode_key(inode, &key); 4242 /* 4243 * If we are doing a fast fsync and the inode was logged before in the 4244 * current transaction, then we know the inode was previously logged and 4245 * it exists in the log tree. For performance reasons, in this case use 4246 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4247 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4248 * contention in case there are concurrent fsyncs for other inodes of the 4249 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4250 * already exists can also result in unnecessarily splitting a leaf. 4251 */ 4252 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4253 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 4254 ASSERT(ret <= 0); 4255 if (ret > 0) 4256 ret = -ENOENT; 4257 } else { 4258 /* 4259 * This means it is the first fsync in the current transaction, 4260 * so the inode item is not in the log and we need to insert it. 4261 * We can never get -EEXIST because we are only called for a fast 4262 * fsync and in case an inode eviction happens after the inode was 4263 * logged before in the current transaction, when we load again 4264 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4265 * flags and set ->logged_trans to 0. 4266 */ 4267 ret = btrfs_insert_empty_item(trans, log, path, &key, 4268 sizeof(*inode_item)); 4269 ASSERT(ret != -EEXIST); 4270 } 4271 if (ret) 4272 return ret; 4273 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4274 struct btrfs_inode_item); 4275 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4276 0, 0); 4277 btrfs_release_path(path); 4278 return 0; 4279 } 4280 4281 static int log_csums(struct btrfs_trans_handle *trans, 4282 struct btrfs_inode *inode, 4283 struct btrfs_root *log_root, 4284 struct btrfs_ordered_sum *sums) 4285 { 4286 const u64 lock_end = sums->logical + sums->len - 1; 4287 struct extent_state *cached_state = NULL; 4288 int ret; 4289 4290 /* 4291 * If this inode was not used for reflink operations in the current 4292 * transaction with new extents, then do the fast path, no need to 4293 * worry about logging checksum items with overlapping ranges. 4294 */ 4295 if (inode->last_reflink_trans < trans->transid) 4296 return btrfs_csum_file_blocks(trans, log_root, sums); 4297 4298 /* 4299 * Serialize logging for checksums. This is to avoid racing with the 4300 * same checksum being logged by another task that is logging another 4301 * file which happens to refer to the same extent as well. Such races 4302 * can leave checksum items in the log with overlapping ranges. 
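* We do this by locking the extent's logical range in the log root's log_csum_range io tree while we delete any overlapping csum items and insert the new ones.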
4303 */ 4304 ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4305 &cached_state); 4306 if (ret) 4307 return ret; 4308 /* 4309 * Due to extent cloning, we might have logged a csum item that covers a 4310 * subrange of a cloned extent, and later we can end up logging a csum 4311 * item for a larger subrange of the same extent or the entire range. 4312 * This would leave csum items in the log tree that cover the same range 4313 * and break the searches for checksums in the log tree, resulting in 4314 * some checksums missing in the fs/subvolume tree. So just delete (or 4315 * trim and adjust) any existing csum items in the log for this range. 4316 */ 4317 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); 4318 if (!ret) 4319 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4320 4321 btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4322 &cached_state); 4323 4324 return ret; 4325 } 4326 4327 static noinline int copy_items(struct btrfs_trans_handle *trans, 4328 struct btrfs_inode *inode, 4329 struct btrfs_path *dst_path, 4330 struct btrfs_path *src_path, 4331 int start_slot, int nr, int inode_only, 4332 u64 logged_isize, struct btrfs_log_ctx *ctx) 4333 { 4334 struct btrfs_root *log = inode->root->log_root; 4335 struct btrfs_file_extent_item *extent; 4336 struct extent_buffer *src; 4337 int ret; 4338 struct btrfs_key *ins_keys; 4339 u32 *ins_sizes; 4340 struct btrfs_item_batch batch; 4341 char *ins_data; 4342 int dst_index; 4343 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4344 const u64 i_size = i_size_read(&inode->vfs_inode); 4345 4346 /* 4347 * To keep lockdep happy and avoid deadlocks, clone the source leaf and 4348 * use the clone. This is because otherwise we would be changing the log 4349 * tree, to insert items from the subvolume tree or insert csum items, 4350 * while holding a read lock on a leaf from the subvolume tree, which 4351 * creates a nasty lock dependency when COWing log tree nodes/leaves: 4352 * 4353 * 1) Modifying the log tree triggers an extent buffer allocation while 4354 * holding a write lock on a parent extent buffer from the log tree. 4355 * Allocating the pages for an extent buffer, or the extent buffer 4356 * struct, can trigger inode eviction and finally the inode eviction 4357 * will trigger a release/remove of a delayed node, which requires 4358 * taking the delayed node's mutex; 4359 * 4360 * 2) Allocating a metadata extent for a log tree can trigger the async 4361 * reclaim thread and make us wait for it to release enough space and 4362 * unblock our reservation ticket. The reclaim thread can start 4363 * flushing delayed items, and that in turn results in the need to 4364 * lock delayed node mutexes and in the need to write lock extent 4365 * buffers of a subvolume tree - all this while holding a write lock 4366 * on the parent extent buffer in the log tree. 4367 * 4368 * So one task in scenario 1) running in parallel with another task in 4369 * scenario 2) could lead to a deadlock, one wanting to lock a delayed 4370 * node mutex while having a read lock on a leaf from the subvolume, 4371 * while the other is holding the delayed node's mutex and wants to 4372 * write lock the same subvolume leaf for flushing delayed items. 
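* By working on a private clone of the source leaf (see clone_leaf()) we never hold a read lock on a subvolume tree leaf while modifying the log tree, which breaks this dependency.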
4373 */ 4374 ret = clone_leaf(src_path, ctx); 4375 if (ret < 0) 4376 return ret; 4377 4378 src = src_path->nodes[0]; 4379 4380 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4381 nr * sizeof(u32), GFP_NOFS); 4382 if (!ins_data) 4383 return -ENOMEM; 4384 4385 ins_sizes = (u32 *)ins_data; 4386 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4387 batch.keys = ins_keys; 4388 batch.data_sizes = ins_sizes; 4389 batch.total_data_size = 0; 4390 batch.nr = 0; 4391 4392 dst_index = 0; 4393 for (int i = 0; i < nr; i++) { 4394 const int src_slot = start_slot + i; 4395 struct btrfs_root *csum_root; 4396 struct btrfs_ordered_sum *sums; 4397 struct btrfs_ordered_sum *sums_next; 4398 LIST_HEAD(ordered_sums); 4399 u64 disk_bytenr; 4400 u64 disk_num_bytes; 4401 u64 extent_offset; 4402 u64 extent_num_bytes; 4403 bool is_old_extent; 4404 4405 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4406 4407 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4408 goto add_to_batch; 4409 4410 extent = btrfs_item_ptr(src, src_slot, 4411 struct btrfs_file_extent_item); 4412 4413 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4414 trans->transid); 4415 4416 /* 4417 * Don't copy extents from past generations. That would make us 4418 * log a lot more metadata for common cases like doing only a 4419 * few random writes into a file and then fsync it for the first 4420 * time or after the full sync flag is set on the inode. We can 4421 * get leaves full of extent items, most of which are from past 4422 * generations, so we can skip them - as long as the inode has 4423 * not been the target of a reflink operation in this transaction, 4424 * as in that case it might have had file extent items with old 4425 * generations copied into it. We also must always log prealloc 4426 * extents that start at or beyond eof, otherwise we would lose 4427 * them on log replay. 4428 */ 4429 if (is_old_extent && 4430 ins_keys[dst_index].offset < i_size && 4431 inode->last_reflink_trans < trans->transid) 4432 continue; 4433 4434 if (skip_csum) 4435 goto add_to_batch; 4436 4437 /* Only regular extents have checksums. */ 4438 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4439 goto add_to_batch; 4440 4441 /* 4442 * If it's an extent created in a past transaction, then its 4443 * checksums are already accessible from the committed csum tree, 4444 * no need to log them. 4445 */ 4446 if (is_old_extent) 4447 goto add_to_batch; 4448 4449 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4450 /* If it's an explicit hole, there are no checksums. 
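A regular file extent item with a disk_bytenr of 0 represents such a hole.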
*/ 4451 if (disk_bytenr == 0) 4452 goto add_to_batch; 4453 4454 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4455 4456 if (btrfs_file_extent_compression(src, extent)) { 4457 extent_offset = 0; 4458 extent_num_bytes = disk_num_bytes; 4459 } else { 4460 extent_offset = btrfs_file_extent_offset(src, extent); 4461 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4462 } 4463 4464 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4465 disk_bytenr += extent_offset; 4466 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, 4467 disk_bytenr + extent_num_bytes - 1, 4468 &ordered_sums, false); 4469 if (ret < 0) 4470 goto out; 4471 ret = 0; 4472 4473 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4474 if (!ret) 4475 ret = log_csums(trans, inode, log, sums); 4476 list_del(&sums->list); 4477 kfree(sums); 4478 } 4479 if (ret) 4480 goto out; 4481 4482 add_to_batch: 4483 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4484 batch.total_data_size += ins_sizes[dst_index]; 4485 batch.nr++; 4486 dst_index++; 4487 } 4488 4489 /* 4490 * We have a leaf full of old extent items that don't need to be logged, 4491 * so we don't need to do anything. 4492 */ 4493 if (batch.nr == 0) 4494 goto out; 4495 4496 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4497 if (ret) 4498 goto out; 4499 4500 dst_index = 0; 4501 for (int i = 0; i < nr; i++) { 4502 const int src_slot = start_slot + i; 4503 const int dst_slot = dst_path->slots[0] + dst_index; 4504 struct btrfs_key key; 4505 unsigned long src_offset; 4506 unsigned long dst_offset; 4507 4508 /* 4509 * We're done, all the remaining items in the source leaf 4510 * correspond to old file extent items. 4511 */ 4512 if (dst_index >= batch.nr) 4513 break; 4514 4515 btrfs_item_key_to_cpu(src, &key, src_slot); 4516 4517 if (key.type != BTRFS_EXTENT_DATA_KEY) 4518 goto copy_item; 4519 4520 extent = btrfs_item_ptr(src, src_slot, 4521 struct btrfs_file_extent_item); 4522 4523 /* See the comment in the previous loop, same logic. 
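We skip file extent items from past generations, unless the inode was the target of a reflink operation in the current transaction.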
*/ 4524 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4525 key.offset < i_size && 4526 inode->last_reflink_trans < trans->transid) 4527 continue; 4528 4529 copy_item: 4530 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4531 src_offset = btrfs_item_ptr_offset(src, src_slot); 4532 4533 if (key.type == BTRFS_INODE_ITEM_KEY) { 4534 struct btrfs_inode_item *inode_item; 4535 4536 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4537 struct btrfs_inode_item); 4538 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4539 &inode->vfs_inode, 4540 inode_only == LOG_INODE_EXISTS, 4541 logged_isize); 4542 } else { 4543 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4544 src_offset, ins_sizes[dst_index]); 4545 } 4546 4547 dst_index++; 4548 } 4549 4550 btrfs_release_path(dst_path); 4551 out: 4552 kfree(ins_data); 4553 4554 return ret; 4555 } 4556 4557 static int extent_cmp(void *priv, const struct list_head *a, 4558 const struct list_head *b) 4559 { 4560 const struct extent_map *em1, *em2; 4561 4562 em1 = list_entry(a, struct extent_map, list); 4563 em2 = list_entry(b, struct extent_map, list); 4564 4565 if (em1->start < em2->start) 4566 return -1; 4567 else if (em1->start > em2->start) 4568 return 1; 4569 return 0; 4570 } 4571 4572 static int log_extent_csums(struct btrfs_trans_handle *trans, 4573 struct btrfs_inode *inode, 4574 struct btrfs_root *log_root, 4575 const struct extent_map *em, 4576 struct btrfs_log_ctx *ctx) 4577 { 4578 struct btrfs_ordered_extent *ordered; 4579 struct btrfs_root *csum_root; 4580 u64 block_start; 4581 u64 csum_offset; 4582 u64 csum_len; 4583 u64 mod_start = em->start; 4584 u64 mod_len = em->len; 4585 LIST_HEAD(ordered_sums); 4586 int ret = 0; 4587 4588 if (inode->flags & BTRFS_INODE_NODATASUM || 4589 (em->flags & EXTENT_FLAG_PREALLOC) || 4590 em->disk_bytenr == EXTENT_MAP_HOLE) 4591 return 0; 4592 4593 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4594 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4595 const u64 mod_end = mod_start + mod_len; 4596 struct btrfs_ordered_sum *sums; 4597 4598 if (mod_len == 0) 4599 break; 4600 4601 if (ordered_end <= mod_start) 4602 continue; 4603 if (mod_end <= ordered->file_offset) 4604 break; 4605 4606 /* 4607 * We are going to copy all the csums on this ordered extent, so 4608 * go ahead and adjust mod_start and mod_len in case this ordered 4609 * extent has already been logged. 4610 */ 4611 if (ordered->file_offset > mod_start) { 4612 if (ordered_end >= mod_end) 4613 mod_len = ordered->file_offset - mod_start; 4614 /* 4615 * If we have this case 4616 * 4617 * |--------- logged extent ---------| 4618 * |----- ordered extent ----| 4619 * 4620 * Just don't mess with mod_start and mod_len, we'll 4621 * just end up logging more csums than we need and it 4622 * will be ok. 4623 */ 4624 } else { 4625 if (ordered_end < mod_end) { 4626 mod_len = mod_end - ordered_end; 4627 mod_start = ordered_end; 4628 } else { 4629 mod_len = 0; 4630 } 4631 } 4632 4633 /* 4634 * To keep us from looping for the above case of an ordered 4635 * extent that falls inside of the logged extent. 4636 */ 4637 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4638 continue; 4639 4640 list_for_each_entry(sums, &ordered->list, list) { 4641 ret = log_csums(trans, inode, log_root, sums); 4642 if (ret) 4643 return ret; 4644 } 4645 } 4646 4647 /* We're done, found all csums in the ordered extents. 
*/ 4648 if (mod_len == 0) 4649 return 0; 4650 4651 /* If we're compressed we have to save the entire range of csums. */ 4652 if (btrfs_extent_map_is_compressed(em)) { 4653 csum_offset = 0; 4654 csum_len = em->disk_num_bytes; 4655 } else { 4656 csum_offset = mod_start - em->start; 4657 csum_len = mod_len; 4658 } 4659 4660 /* block start is already adjusted for the file extent offset. */ 4661 block_start = btrfs_extent_map_block_start(em); 4662 csum_root = btrfs_csum_root(trans->fs_info, block_start); 4663 ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, 4664 block_start + csum_offset + csum_len - 1, 4665 &ordered_sums, false); 4666 if (ret < 0) 4667 return ret; 4668 ret = 0; 4669 4670 while (!list_empty(&ordered_sums)) { 4671 struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums, 4672 struct btrfs_ordered_sum, 4673 list); 4674 if (!ret) 4675 ret = log_csums(trans, inode, log_root, sums); 4676 list_del(&sums->list); 4677 kfree(sums); 4678 } 4679 4680 return ret; 4681 } 4682 4683 static int log_one_extent(struct btrfs_trans_handle *trans, 4684 struct btrfs_inode *inode, 4685 const struct extent_map *em, 4686 struct btrfs_path *path, 4687 struct btrfs_log_ctx *ctx) 4688 { 4689 struct btrfs_drop_extents_args drop_args = { 0 }; 4690 struct btrfs_root *log = inode->root->log_root; 4691 struct btrfs_file_extent_item fi = { 0 }; 4692 struct extent_buffer *leaf; 4693 struct btrfs_key key; 4694 enum btrfs_compression_type compress_type; 4695 u64 extent_offset = em->offset; 4696 u64 block_start = btrfs_extent_map_block_start(em); 4697 u64 block_len; 4698 int ret; 4699 4700 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4701 if (em->flags & EXTENT_FLAG_PREALLOC) 4702 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4703 else 4704 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4705 4706 block_len = em->disk_num_bytes; 4707 compress_type = btrfs_extent_map_compression(em); 4708 if (compress_type != BTRFS_COMPRESS_NONE) { 4709 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); 4710 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4711 } else if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { 4712 btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start - extent_offset); 4713 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4714 } 4715 4716 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4717 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4718 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4719 btrfs_set_stack_file_extent_compression(&fi, compress_type); 4720 4721 ret = log_extent_csums(trans, inode, log, em, ctx); 4722 if (ret) 4723 return ret; 4724 4725 /* 4726 * If this is the first time we are logging the inode in the current 4727 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4728 * because it does a deletion search, which always acquires write locks 4729 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4730 * but also adds significant contention in a log tree, since log trees 4731 * are small, with a root at level 2 or 3 at most, due to their short 4732 * life span. 
4733 */ 4734 if (ctx->logged_before) { 4735 drop_args.path = path; 4736 drop_args.start = em->start; 4737 drop_args.end = em->start + em->len; 4738 drop_args.replace_extent = true; 4739 drop_args.extent_item_size = sizeof(fi); 4740 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4741 if (ret) 4742 return ret; 4743 } 4744 4745 if (!drop_args.extent_inserted) { 4746 key.objectid = btrfs_ino(inode); 4747 key.type = BTRFS_EXTENT_DATA_KEY; 4748 key.offset = em->start; 4749 4750 ret = btrfs_insert_empty_item(trans, log, path, &key, 4751 sizeof(fi)); 4752 if (ret) 4753 return ret; 4754 } 4755 leaf = path->nodes[0]; 4756 write_extent_buffer(leaf, &fi, 4757 btrfs_item_ptr_offset(leaf, path->slots[0]), 4758 sizeof(fi)); 4759 4760 btrfs_release_path(path); 4761 4762 return ret; 4763 } 4764 4765 /* 4766 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4767 * lose them after doing a full/fast fsync and replaying the log. We scan the 4768 * subvolume's root instead of iterating the inode's extent map tree because 4769 * otherwise we can log incorrect extent items based on extent map conversion. 4770 * That can happen due to the fact that extent maps are merged when they 4771 * are not in the extent map tree's list of modified extents. 4772 */ 4773 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4774 struct btrfs_inode *inode, 4775 struct btrfs_path *path, 4776 struct btrfs_log_ctx *ctx) 4777 { 4778 struct btrfs_root *root = inode->root; 4779 struct btrfs_key key; 4780 const u64 i_size = i_size_read(&inode->vfs_inode); 4781 const u64 ino = btrfs_ino(inode); 4782 struct btrfs_path *dst_path = NULL; 4783 bool dropped_extents = false; 4784 u64 truncate_offset = i_size; 4785 struct extent_buffer *leaf; 4786 int slot; 4787 int ins_nr = 0; 4788 int start_slot = 0; 4789 int ret; 4790 4791 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4792 return 0; 4793 4794 key.objectid = ino; 4795 key.type = BTRFS_EXTENT_DATA_KEY; 4796 key.offset = i_size; 4797 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4798 if (ret < 0) 4799 goto out; 4800 4801 /* 4802 * We must check if there is a prealloc extent that starts before the 4803 * i_size and crosses the i_size boundary. This is to ensure later we 4804 * truncate down to the end of that extent and not to the i_size, as 4805 * otherwise we end up losing part of the prealloc extent after a log 4806 * replay and with an implicit hole if there is another prealloc extent 4807 * that starts at an offset beyond i_size. 
4808 */ 4809 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4810 if (ret < 0) 4811 goto out; 4812 4813 if (ret == 0) { 4814 struct btrfs_file_extent_item *ei; 4815 4816 leaf = path->nodes[0]; 4817 slot = path->slots[0]; 4818 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4819 4820 if (btrfs_file_extent_type(leaf, ei) == 4821 BTRFS_FILE_EXTENT_PREALLOC) { 4822 u64 extent_end; 4823 4824 btrfs_item_key_to_cpu(leaf, &key, slot); 4825 extent_end = key.offset + 4826 btrfs_file_extent_num_bytes(leaf, ei); 4827 4828 if (extent_end > i_size) 4829 truncate_offset = extent_end; 4830 } 4831 } else { 4832 ret = 0; 4833 } 4834 4835 while (true) { 4836 leaf = path->nodes[0]; 4837 slot = path->slots[0]; 4838 4839 if (slot >= btrfs_header_nritems(leaf)) { 4840 if (ins_nr > 0) { 4841 ret = copy_items(trans, inode, dst_path, path, 4842 start_slot, ins_nr, 1, 0, ctx); 4843 if (ret < 0) 4844 goto out; 4845 ins_nr = 0; 4846 } 4847 ret = btrfs_next_leaf(root, path); 4848 if (ret < 0) 4849 goto out; 4850 if (ret > 0) { 4851 ret = 0; 4852 break; 4853 } 4854 continue; 4855 } 4856 4857 btrfs_item_key_to_cpu(leaf, &key, slot); 4858 if (key.objectid > ino) 4859 break; 4860 if (WARN_ON_ONCE(key.objectid < ino) || 4861 key.type < BTRFS_EXTENT_DATA_KEY || 4862 key.offset < i_size) { 4863 path->slots[0]++; 4864 continue; 4865 } 4866 /* 4867 * Avoid overlapping items in the log tree. The first time we 4868 * get here, get rid of everything from a past fsync. After 4869 * that, if the current extent starts before the end of the last 4870 * extent we copied, truncate the last one. This can happen if 4871 * an ordered extent completion modifies the subvolume tree 4872 * while btrfs_next_leaf() has the tree unlocked. 4873 */ 4874 if (!dropped_extents || key.offset < truncate_offset) { 4875 ret = truncate_inode_items(trans, root->log_root, inode, 4876 min(key.offset, truncate_offset), 4877 BTRFS_EXTENT_DATA_KEY); 4878 if (ret) 4879 goto out; 4880 dropped_extents = true; 4881 } 4882 truncate_offset = btrfs_file_extent_end(path); 4883 if (ins_nr == 0) 4884 start_slot = slot; 4885 ins_nr++; 4886 path->slots[0]++; 4887 if (!dst_path) { 4888 dst_path = btrfs_alloc_path(); 4889 if (!dst_path) { 4890 ret = -ENOMEM; 4891 goto out; 4892 } 4893 } 4894 } 4895 if (ins_nr > 0) 4896 ret = copy_items(trans, inode, dst_path, path, 4897 start_slot, ins_nr, 1, 0, ctx); 4898 out: 4899 btrfs_release_path(path); 4900 btrfs_free_path(dst_path); 4901 return ret; 4902 } 4903 4904 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4905 struct btrfs_inode *inode, 4906 struct btrfs_path *path, 4907 struct btrfs_log_ctx *ctx) 4908 { 4909 struct btrfs_ordered_extent *ordered; 4910 struct btrfs_ordered_extent *tmp; 4911 struct extent_map *em, *n; 4912 LIST_HEAD(extents); 4913 struct extent_map_tree *tree = &inode->extent_tree; 4914 int ret = 0; 4915 int num = 0; 4916 4917 write_lock(&tree->lock); 4918 4919 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4920 list_del_init(&em->list); 4921 /* 4922 * Just an arbitrary number, this can be really CPU intensive 4923 * once we start getting a lot of extents, and really once we 4924 * have a bunch of extents we just want to commit since it will 4925 * be faster. 4926 */ 4927 if (++num > 32768) { 4928 list_del_init(&tree->modified_extents); 4929 ret = -EFBIG; 4930 goto process; 4931 } 4932 4933 if (em->generation < trans->transid) 4934 continue; 4935 4936 /* We log prealloc extents beyond eof later. 
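They are handled by btrfs_log_prealloc_extents(), which scans the subvolume tree instead of the extent map tree, so we skip them here.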
*/ 4937 if ((em->flags & EXTENT_FLAG_PREALLOC) && 4938 em->start >= i_size_read(&inode->vfs_inode)) 4939 continue; 4940 4941 /* Need a ref to keep it from getting evicted from cache */ 4942 refcount_inc(&em->refs); 4943 em->flags |= EXTENT_FLAG_LOGGING; 4944 list_add_tail(&em->list, &extents); 4945 num++; 4946 } 4947 4948 list_sort(NULL, &extents, extent_cmp); 4949 process: 4950 while (!list_empty(&extents)) { 4951 em = list_first_entry(&extents, struct extent_map, list); 4952 4953 list_del_init(&em->list); 4954 4955 /* 4956 * If we had an error we just need to delete everybody from our 4957 * private list. 4958 */ 4959 if (ret) { 4960 btrfs_clear_em_logging(inode, em); 4961 btrfs_free_extent_map(em); 4962 continue; 4963 } 4964 4965 write_unlock(&tree->lock); 4966 4967 ret = log_one_extent(trans, inode, em, path, ctx); 4968 write_lock(&tree->lock); 4969 btrfs_clear_em_logging(inode, em); 4970 btrfs_free_extent_map(em); 4971 } 4972 WARN_ON(!list_empty(&extents)); 4973 write_unlock(&tree->lock); 4974 4975 if (!ret) 4976 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx); 4977 if (ret) 4978 return ret; 4979 4980 /* 4981 * We have logged all extents successfully, now make sure the commit of 4982 * the current transaction waits for the ordered extents to complete 4983 * before it commits and wipes out the log trees, otherwise we would 4984 * lose data if an ordered extent completes after the transaction 4985 * commits and a power failure happens after the transaction commit. 4986 */ 4987 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 4988 list_del_init(&ordered->log_list); 4989 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 4990 4991 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4992 spin_lock_irq(&inode->ordered_tree_lock); 4993 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4994 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 4995 atomic_inc(&trans->transaction->pending_ordered); 4996 } 4997 spin_unlock_irq(&inode->ordered_tree_lock); 4998 } 4999 btrfs_put_ordered_extent(ordered); 5000 } 5001 5002 return 0; 5003 } 5004 5005 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 5006 struct btrfs_path *path, u64 *size_ret) 5007 { 5008 struct btrfs_key key; 5009 int ret; 5010 5011 key.objectid = btrfs_ino(inode); 5012 key.type = BTRFS_INODE_ITEM_KEY; 5013 key.offset = 0; 5014 5015 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 5016 if (ret < 0) { 5017 return ret; 5018 } else if (ret > 0) { 5019 *size_ret = 0; 5020 } else { 5021 struct btrfs_inode_item *item; 5022 5023 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5024 struct btrfs_inode_item); 5025 *size_ret = btrfs_inode_size(path->nodes[0], item); 5026 /* 5027 * If the in-memory inode's i_size is smaller than the inode 5028 * size stored in the btree, return the inode's i_size, so 5029 * that we get a correct inode size after replaying the log 5030 * when before a power failure we had a shrinking truncate 5031 * followed by addition of a new name (rename / new hard link). 5032 * Otherwise return the inode size from the btree, to avoid 5033 * data loss when replaying a log due to previously doing a 5034 * write that expands the inode's size and logging a new name 5035 * immediately after. 5036 */ 5037 if (*size_ret > inode->vfs_inode.i_size) 5038 *size_ret = inode->vfs_inode.i_size; 5039 } 5040 5041 btrfs_release_path(path); 5042 return 0; 5043 } 5044 5045 /* 5046 * At the moment we always log all xattrs.
This is to figure out at log replay 5047 * time which xattrs must have their deletion replayed. If an xattr is missing 5048 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5049 * because if an xattr is deleted, the inode is fsynced and a power failure 5050 * happens, causing the log to be replayed the next time the fs is mounted, 5051 * then we want the xattr to not exist anymore (same behaviour as other filesystems 5052 * with a journal, ext3/4, xfs, f2fs, etc). 5053 */ 5054 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5055 struct btrfs_inode *inode, 5056 struct btrfs_path *path, 5057 struct btrfs_path *dst_path, 5058 struct btrfs_log_ctx *ctx) 5059 { 5060 struct btrfs_root *root = inode->root; 5061 int ret; 5062 struct btrfs_key key; 5063 const u64 ino = btrfs_ino(inode); 5064 int ins_nr = 0; 5065 int start_slot = 0; 5066 bool found_xattrs = false; 5067 5068 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5069 return 0; 5070 5071 key.objectid = ino; 5072 key.type = BTRFS_XATTR_ITEM_KEY; 5073 key.offset = 0; 5074 5075 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5076 if (ret < 0) 5077 return ret; 5078 5079 while (true) { 5080 int slot = path->slots[0]; 5081 struct extent_buffer *leaf = path->nodes[0]; 5082 int nritems = btrfs_header_nritems(leaf); 5083 5084 if (slot >= nritems) { 5085 if (ins_nr > 0) { 5086 ret = copy_items(trans, inode, dst_path, path, 5087 start_slot, ins_nr, 1, 0, ctx); 5088 if (ret < 0) 5089 return ret; 5090 ins_nr = 0; 5091 } 5092 ret = btrfs_next_leaf(root, path); 5093 if (ret < 0) 5094 return ret; 5095 else if (ret > 0) 5096 break; 5097 continue; 5098 } 5099 5100 btrfs_item_key_to_cpu(leaf, &key, slot); 5101 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5102 break; 5103 5104 if (ins_nr == 0) 5105 start_slot = slot; 5106 ins_nr++; 5107 path->slots[0]++; 5108 found_xattrs = true; 5109 cond_resched(); 5110 } 5111 if (ins_nr > 0) { 5112 ret = copy_items(trans, inode, dst_path, path, 5113 start_slot, ins_nr, 1, 0, ctx); 5114 if (ret < 0) 5115 return ret; 5116 } 5117 5118 if (!found_xattrs) 5119 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5120 5121 return 0; 5122 } 5123 5124 /* 5125 * When using the NO_HOLES feature, if we punched a hole that causes the 5126 * deletion of entire leaves or all the extent items of the first leaf (the one 5127 * that contains the inode item and references) we may end up not processing 5128 * any extents, because there are no leaves with a generation matching the 5129 * current transaction that have extent items for our inode. So we need to find 5130 * if any holes exist and then log them. We also need to log holes after any 5131 * truncate operation that changes the inode's size.
5132 */ 5133 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5134 struct btrfs_inode *inode, 5135 struct btrfs_path *path) 5136 { 5137 struct btrfs_root *root = inode->root; 5138 struct btrfs_fs_info *fs_info = root->fs_info; 5139 struct btrfs_key key; 5140 const u64 ino = btrfs_ino(inode); 5141 const u64 i_size = i_size_read(&inode->vfs_inode); 5142 u64 prev_extent_end = 0; 5143 int ret; 5144 5145 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5146 return 0; 5147 5148 key.objectid = ino; 5149 key.type = BTRFS_EXTENT_DATA_KEY; 5150 key.offset = 0; 5151 5152 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5153 if (ret < 0) 5154 return ret; 5155 5156 while (true) { 5157 struct extent_buffer *leaf = path->nodes[0]; 5158 5159 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5160 ret = btrfs_next_leaf(root, path); 5161 if (ret < 0) 5162 return ret; 5163 if (ret > 0) { 5164 ret = 0; 5165 break; 5166 } 5167 leaf = path->nodes[0]; 5168 } 5169 5170 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5171 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5172 break; 5173 5174 /* We have a hole, log it. */ 5175 if (prev_extent_end < key.offset) { 5176 const u64 hole_len = key.offset - prev_extent_end; 5177 5178 /* 5179 * Release the path to avoid deadlocks with other code 5180 * paths that search the root while holding locks on 5181 * leafs from the log root. 5182 */ 5183 btrfs_release_path(path); 5184 ret = btrfs_insert_hole_extent(trans, root->log_root, 5185 ino, prev_extent_end, 5186 hole_len); 5187 if (ret < 0) 5188 return ret; 5189 5190 /* 5191 * Search for the same key again in the root. Since it's 5192 * an extent item and we are holding the inode lock, the 5193 * key must still exist. If it doesn't just emit warning 5194 * and return an error to fall back to a transaction 5195 * commit. 5196 */ 5197 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5198 if (ret < 0) 5199 return ret; 5200 if (WARN_ON(ret > 0)) 5201 return -ENOENT; 5202 leaf = path->nodes[0]; 5203 } 5204 5205 prev_extent_end = btrfs_file_extent_end(path); 5206 path->slots[0]++; 5207 cond_resched(); 5208 } 5209 5210 if (prev_extent_end < i_size) { 5211 u64 hole_len; 5212 5213 btrfs_release_path(path); 5214 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5215 ret = btrfs_insert_hole_extent(trans, root->log_root, ino, 5216 prev_extent_end, hole_len); 5217 if (ret < 0) 5218 return ret; 5219 } 5220 5221 return 0; 5222 } 5223 5224 /* 5225 * When we are logging a new inode X, check if it doesn't have a reference that 5226 * matches the reference from some other inode Y created in a past transaction 5227 * and that was renamed in the current transaction. If we don't do this, then at 5228 * log replay time we can lose inode Y (and all its files if it's a directory): 5229 * 5230 * mkdir /mnt/x 5231 * echo "hello world" > /mnt/x/foobar 5232 * sync 5233 * mv /mnt/x /mnt/y 5234 * mkdir /mnt/x # or touch /mnt/x 5235 * xfs_io -c fsync /mnt/x 5236 * <power fail> 5237 * mount fs, trigger log replay 5238 * 5239 * After the log replay procedure, we would lose the first directory and all its 5240 * files (file foobar). 
5241 * For the case where inode Y is not a directory we simply end up losing it: 5242 * 5243 * echo "123" > /mnt/foo 5244 * sync 5245 * mv /mnt/foo /mnt/bar 5246 * echo "abc" > /mnt/foo 5247 * xfs_io -c fsync /mnt/foo 5248 * <power fail> 5249 * 5250 * We also need this for cases where a snapshot entry is replaced by some other 5251 * entry (file or directory) otherwise we end up with an unreplayable log due to 5252 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5253 * if it were a regular entry: 5254 * 5255 * mkdir /mnt/x 5256 * btrfs subvolume snapshot /mnt /mnt/x/snap 5257 * btrfs subvolume delete /mnt/x/snap 5258 * rmdir /mnt/x 5259 * mkdir /mnt/x 5260 * fsync /mnt/x or fsync some new file inside it 5261 * <power fail> 5262 * 5263 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5264 * the same transaction. 5265 */ 5266 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5267 const int slot, 5268 const struct btrfs_key *key, 5269 struct btrfs_inode *inode, 5270 u64 *other_ino, u64 *other_parent) 5271 { 5272 int ret; 5273 struct btrfs_path *search_path; 5274 char *name = NULL; 5275 u32 name_len = 0; 5276 u32 item_size = btrfs_item_size(eb, slot); 5277 u32 cur_offset = 0; 5278 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5279 5280 search_path = btrfs_alloc_path(); 5281 if (!search_path) 5282 return -ENOMEM; 5283 search_path->search_commit_root = 1; 5284 search_path->skip_locking = 1; 5285 5286 while (cur_offset < item_size) { 5287 u64 parent; 5288 u32 this_name_len; 5289 u32 this_len; 5290 unsigned long name_ptr; 5291 struct btrfs_dir_item *di; 5292 struct fscrypt_str name_str; 5293 5294 if (key->type == BTRFS_INODE_REF_KEY) { 5295 struct btrfs_inode_ref *iref; 5296 5297 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5298 parent = key->offset; 5299 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5300 name_ptr = (unsigned long)(iref + 1); 5301 this_len = sizeof(*iref) + this_name_len; 5302 } else { 5303 struct btrfs_inode_extref *extref; 5304 5305 extref = (struct btrfs_inode_extref *)(ptr + 5306 cur_offset); 5307 parent = btrfs_inode_extref_parent(eb, extref); 5308 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5309 name_ptr = (unsigned long)&extref->name; 5310 this_len = sizeof(*extref) + this_name_len; 5311 } 5312 5313 if (this_name_len > name_len) { 5314 char *new_name; 5315 5316 new_name = krealloc(name, this_name_len, GFP_NOFS); 5317 if (!new_name) { 5318 ret = -ENOMEM; 5319 goto out; 5320 } 5321 name_len = this_name_len; 5322 name = new_name; 5323 } 5324 5325 read_extent_buffer(eb, name, name_ptr, this_name_len); 5326 5327 name_str.name = name; 5328 name_str.len = this_name_len; 5329 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5330 parent, &name_str, 0); 5331 if (di && !IS_ERR(di)) { 5332 struct btrfs_key di_key; 5333 5334 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5335 di, &di_key); 5336 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5337 if (di_key.objectid != key->objectid) { 5338 ret = 1; 5339 *other_ino = di_key.objectid; 5340 *other_parent = parent; 5341 } else { 5342 ret = 0; 5343 } 5344 } else { 5345 ret = -EAGAIN; 5346 } 5347 goto out; 5348 } else if (IS_ERR(di)) { 5349 ret = PTR_ERR(di); 5350 goto out; 5351 } 5352 btrfs_release_path(search_path); 5353 5354 cur_offset += this_len; 5355 } 5356 ret = 0; 5357 out: 5358 btrfs_free_path(search_path); 5359 kfree(name); 5360 return ret; 5361 } 5362 5363 /* 5364 * Check if we need to log an inode. 
This is used in contexts where while 5365 * logging an inode we need to log another inode (either that it exists or in 5366 * full mode). This is used instead of btrfs_inode_in_log() because the latter 5367 * requires the inode to be in the log and have the log transaction committed, 5368 * while here we do not care if the log transaction was already committed - our 5369 * caller will commit the log later - and we want to avoid logging an inode 5370 * multiple times when multiple tasks have joined the same log transaction. 5371 */ 5372 static bool need_log_inode(const struct btrfs_trans_handle *trans, 5373 struct btrfs_inode *inode) 5374 { 5375 /* 5376 * If a directory was not modified, no dentries added or removed, we can 5377 * and should avoid logging it. 5378 */ 5379 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 5380 return false; 5381 5382 /* 5383 * If this inode does not have new/updated/deleted xattrs since the last 5384 * time it was logged and is flagged as logged in the current transaction, 5385 * we can skip logging it. As for new/deleted names, those are updated in 5386 * the log by link/unlink/rename operations. 5387 * In case the inode was logged and then evicted and reloaded, its 5388 * logged_trans will be 0, in which case we have to fully log it since 5389 * logged_trans is a transient field, not persisted. 5390 */ 5391 if (inode_logged(trans, inode, NULL) == 1 && 5392 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 5393 return false; 5394 5395 return true; 5396 } 5397 5398 struct btrfs_dir_list { 5399 u64 ino; 5400 struct list_head list; 5401 }; 5402 5403 /* 5404 * Log the inodes of the new dentries of a directory. 5405 * See process_dir_items_leaf() for details about why it is needed. 5406 * This is a recursive operation - if an existing dentry corresponds to a 5407 * directory, that directory's new entries are logged too (same behaviour as 5408 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 5409 * the dentries point to we do not acquire their VFS lock, otherwise lockdep 5410 * complains about the following circular lock dependency / possible deadlock: 5411 * 5412 * CPU0 CPU1 5413 * ---- ---- 5414 * lock(&type->i_mutex_dir_key#3/2); 5415 * lock(sb_internal#2); 5416 * lock(&type->i_mutex_dir_key#3/2); 5417 * lock(&sb->s_type->i_mutex_key#14); 5418 * 5419 * Where sb_internal is the lock (a counter that works as a lock) acquired by 5420 * sb_start_intwrite() in btrfs_start_transaction(). 5421 * Not acquiring the VFS lock of the inodes is still safe because: 5422 * 5423 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 5424 * that while logging the inode new references (names) are added or removed 5425 * from the inode, leaving the logged inode item with a link count that does 5426 * not match the number of logged inode reference items. This is fine because 5427 * at log replay time we compute the real number of links and correct the 5428 * link count in the inode item (see replay_one_buffer() and 5429 * link_to_fixup_dir()); 5430 * 5431 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 5432 * while logging the inode's items new index items (key type 5433 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 5434 * has a size that doesn't match the sum of the lengths of all the logged 5435 * names - this is ok, not a problem, because at log replay time we set the 5436 * directory's i_size to the correct value (see replay_one_name() and 5437 * overwrite_item()). 5438 */ 5439 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5440 struct btrfs_inode *start_inode, 5441 struct btrfs_log_ctx *ctx) 5442 { 5443 struct btrfs_root *root = start_inode->root; 5444 struct btrfs_path *path; 5445 LIST_HEAD(dir_list); 5446 struct btrfs_dir_list *dir_elem; 5447 u64 ino = btrfs_ino(start_inode); 5448 struct btrfs_inode *curr_inode = start_inode; 5449 int ret = 0; 5450 5451 /* 5452 * If we are logging a new name, as part of a link or rename operation, 5453 * don't bother logging new dentries, as we just want to log the names 5454 * of an inode and that any new parents exist. 5455 */ 5456 if (ctx->logging_new_name) 5457 return 0; 5458 5459 path = btrfs_alloc_path(); 5460 if (!path) 5461 return -ENOMEM; 5462 5463 /* Pairs with btrfs_add_delayed_iput below. */ 5464 ihold(&curr_inode->vfs_inode); 5465 5466 while (true) { 5467 struct btrfs_key key; 5468 struct btrfs_key found_key; 5469 u64 next_index; 5470 bool continue_curr_inode = true; 5471 int iter_ret; 5472 5473 key.objectid = ino; 5474 key.type = BTRFS_DIR_INDEX_KEY; 5475 key.offset = btrfs_get_first_dir_index_to_log(curr_inode); 5476 next_index = key.offset; 5477 again: 5478 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { 5479 struct extent_buffer *leaf = path->nodes[0]; 5480 struct btrfs_dir_item *di; 5481 struct btrfs_key di_key; 5482 struct btrfs_inode *di_inode; 5483 int log_mode = LOG_INODE_EXISTS; 5484 int type; 5485 5486 if (found_key.objectid != ino || 5487 found_key.type != BTRFS_DIR_INDEX_KEY) { 5488 continue_curr_inode = false; 5489 break; 5490 } 5491 5492 next_index = found_key.offset + 1; 5493 5494 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 5495 type = btrfs_dir_ftype(leaf, di); 5496 if (btrfs_dir_transid(leaf, di) < trans->transid) 5497 continue; 5498 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5499 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5500 continue; 5501 5502 btrfs_release_path(path); 5503 di_inode = btrfs_iget_logging(di_key.objectid, root); 5504 if (IS_ERR(di_inode)) { 5505 ret = PTR_ERR(di_inode); 5506 goto out; 5507 } 5508 5509 if (!need_log_inode(trans, di_inode)) { 5510 btrfs_add_delayed_iput(di_inode); 5511 break; 5512 } 5513 5514 ctx->log_new_dentries = false; 5515 if (type == BTRFS_FT_DIR) 5516 log_mode = LOG_INODE_ALL; 5517 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); 5518 btrfs_add_delayed_iput(di_inode); 5519 if (ret) 5520 goto out; 5521 if (ctx->log_new_dentries) { 5522 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5523 if (!dir_elem) { 5524 ret = -ENOMEM; 5525 goto out; 5526 } 5527 dir_elem->ino = di_key.objectid; 5528 list_add_tail(&dir_elem->list, &dir_list); 5529 } 5530 break; 5531 } 5532 5533 btrfs_release_path(path); 5534 5535 if (iter_ret < 0) { 5536 ret = iter_ret; 5537 goto out; 5538 } else if (iter_ret > 0) { 5539 continue_curr_inode = false; 5540 } else { 5541 key = found_key; 5542 } 5543 5544 if (continue_curr_inode && key.offset < (u64)-1) { 5545 key.offset++; 5546 goto again; 5547 } 5548 5549 btrfs_set_first_dir_index_to_log(curr_inode, next_index); 5550 5551 if 
(list_empty(&dir_list)) 5552 break; 5553 5554 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); 5555 ino = dir_elem->ino; 5556 list_del(&dir_elem->list); 5557 kfree(dir_elem); 5558 5559 btrfs_add_delayed_iput(curr_inode); 5560 5561 curr_inode = btrfs_iget_logging(ino, root); 5562 if (IS_ERR(curr_inode)) { 5563 ret = PTR_ERR(curr_inode); 5564 curr_inode = NULL; 5565 break; 5566 } 5567 } 5568 out: 5569 btrfs_free_path(path); 5570 if (curr_inode) 5571 btrfs_add_delayed_iput(curr_inode); 5572 5573 if (ret) { 5574 struct btrfs_dir_list *next; 5575 5576 list_for_each_entry_safe(dir_elem, next, &dir_list, list) 5577 kfree(dir_elem); 5578 } 5579 5580 return ret; 5581 } 5582 5583 struct btrfs_ino_list { 5584 u64 ino; 5585 u64 parent; 5586 struct list_head list; 5587 }; 5588 5589 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) 5590 { 5591 struct btrfs_ino_list *curr; 5592 struct btrfs_ino_list *next; 5593 5594 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { 5595 list_del(&curr->list); 5596 kfree(curr); 5597 } 5598 } 5599 5600 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, 5601 struct btrfs_path *path) 5602 { 5603 struct btrfs_key key; 5604 int ret; 5605 5606 key.objectid = ino; 5607 key.type = BTRFS_INODE_ITEM_KEY; 5608 key.offset = 0; 5609 5610 path->search_commit_root = 1; 5611 path->skip_locking = 1; 5612 5613 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5614 if (WARN_ON_ONCE(ret > 0)) { 5615 /* 5616 * We have previously found the inode through the commit root 5617 * so this should not happen. If it does, just error out and 5618 * fallback to a transaction commit. 5619 */ 5620 ret = -ENOENT; 5621 } else if (ret == 0) { 5622 struct btrfs_inode_item *item; 5623 5624 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5625 struct btrfs_inode_item); 5626 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) 5627 ret = 1; 5628 } 5629 5630 btrfs_release_path(path); 5631 path->search_commit_root = 0; 5632 path->skip_locking = 0; 5633 5634 return ret; 5635 } 5636 5637 static int add_conflicting_inode(struct btrfs_trans_handle *trans, 5638 struct btrfs_root *root, 5639 struct btrfs_path *path, 5640 u64 ino, u64 parent, 5641 struct btrfs_log_ctx *ctx) 5642 { 5643 struct btrfs_ino_list *ino_elem; 5644 struct btrfs_inode *inode; 5645 5646 /* 5647 * It's rare to have a lot of conflicting inodes, in practice it is not 5648 * common to have more than 1 or 2. We don't want to collect too many, 5649 * as we could end up logging too many inodes (even if only in 5650 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction 5651 * commits. 5652 */ 5653 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) 5654 return BTRFS_LOG_FORCE_COMMIT; 5655 5656 inode = btrfs_iget_logging(ino, root); 5657 /* 5658 * If the other inode that had a conflicting dir entry was deleted in 5659 * the current transaction then we either: 5660 * 5661 * 1) Log the parent directory (later after adding it to the list) if 5662 * the inode is a directory. This is because it may be a deleted 5663 * subvolume/snapshot or it may be a regular directory that had 5664 * deleted subvolumes/snapshots (or subdirectories that had them), 5665 * and at the moment we can't deal with dropping subvolumes/snapshots 5666 * during log replay. 
So we just log the parent, which will result in 5667 * a fallback to a transaction commit if we are dealing with those 5668 * cases (last_unlink_trans will match the current transaction); 5669 * 5670 * 2) Do nothing if it's not a directory. During log replay we simply 5671 * unlink the conflicting dentry from the parent directory and then 5672 * add the dentry for our inode. Like this we can avoid logging the 5673 * parent directory (and maybe fallback to a transaction commit in 5674 * case it has a last_unlink_trans == trans->transid, due to moving 5675 * some inode from it to some other directory). 5676 */ 5677 if (IS_ERR(inode)) { 5678 int ret = PTR_ERR(inode); 5679 5680 if (ret != -ENOENT) 5681 return ret; 5682 5683 ret = conflicting_inode_is_dir(root, ino, path); 5684 /* Not a directory or we got an error. */ 5685 if (ret <= 0) 5686 return ret; 5687 5688 /* Conflicting inode is a directory, so we'll log its parent. */ 5689 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5690 if (!ino_elem) 5691 return -ENOMEM; 5692 ino_elem->ino = ino; 5693 ino_elem->parent = parent; 5694 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5695 ctx->num_conflict_inodes++; 5696 5697 return 0; 5698 } 5699 5700 /* 5701 * If the inode was already logged skip it - otherwise we can hit an 5702 * infinite loop. Example: 5703 * 5704 * From the commit root (previous transaction) we have the following 5705 * inodes: 5706 * 5707 * inode 257 a directory 5708 * inode 258 with references "zz" and "zz_link" on inode 257 5709 * inode 259 with reference "a" on inode 257 5710 * 5711 * And in the current (uncommitted) transaction we have: 5712 * 5713 * inode 257 a directory, unchanged 5714 * inode 258 with references "a" and "a2" on inode 257 5715 * inode 259 with reference "zz_link" on inode 257 5716 * inode 261 with reference "zz" on inode 257 5717 * 5718 * When logging inode 261 the following infinite loop could 5719 * happen if we don't skip already logged inodes: 5720 * 5721 * - we detect inode 258 as a conflicting inode, with inode 261 5722 * on reference "zz", and log it; 5723 * 5724 * - we detect inode 259 as a conflicting inode, with inode 258 5725 * on reference "a", and log it; 5726 * 5727 * - we detect inode 258 as a conflicting inode, with inode 259 5728 * on reference "zz_link", and log it - again! After this we 5729 * repeat the above steps forever. 5730 * 5731 * Here we can use need_log_inode() because we only need to log the 5732 * inode in LOG_INODE_EXISTS mode and rename operations update the log, 5733 * so that the log ends up with the new name and without the old name. 5734 */ 5735 if (!need_log_inode(trans, inode)) { 5736 btrfs_add_delayed_iput(inode); 5737 return 0; 5738 } 5739 5740 btrfs_add_delayed_iput(inode); 5741 5742 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5743 if (!ino_elem) 5744 return -ENOMEM; 5745 ino_elem->ino = ino; 5746 ino_elem->parent = parent; 5747 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5748 ctx->num_conflict_inodes++; 5749 5750 return 0; 5751 } 5752 5753 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5754 struct btrfs_root *root, 5755 struct btrfs_log_ctx *ctx) 5756 { 5757 int ret = 0; 5758 5759 /* 5760 * Conflicting inodes are logged by the first call to btrfs_log_inode(), 5761 * otherwise we could have unbounded recursion of btrfs_log_inode() 5762 * calls. This check guarantees we can have only 1 level of recursion. 
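* For example: while logging some inode A we may find a conflicting inode B and log it from the loop below; if logging B turns up yet another conflicting inode C, that inode is only appended to ctx->conflict_inodes (by add_conflicting_inode()) and is picked up by this same loop, instead of triggering another nested call into this function.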
5763 */ 5764 if (ctx->logging_conflict_inodes) 5765 return 0; 5766 5767 ctx->logging_conflict_inodes = true; 5768 5769 /* 5770 * New conflicting inodes may be found and added to the list while we 5771 * are logging a conflicting inode, so keep iterating while the list is 5772 * not empty. 5773 */ 5774 while (!list_empty(&ctx->conflict_inodes)) { 5775 struct btrfs_ino_list *curr; 5776 struct btrfs_inode *inode; 5777 u64 ino; 5778 u64 parent; 5779 5780 curr = list_first_entry(&ctx->conflict_inodes, 5781 struct btrfs_ino_list, list); 5782 ino = curr->ino; 5783 parent = curr->parent; 5784 list_del(&curr->list); 5785 kfree(curr); 5786 5787 inode = btrfs_iget_logging(ino, root); 5788 /* 5789 * If the other inode that had a conflicting dir entry was 5790 * deleted in the current transaction, we need to log its parent 5791 * directory. See the comment at add_conflicting_inode(). 5792 */ 5793 if (IS_ERR(inode)) { 5794 ret = PTR_ERR(inode); 5795 if (ret != -ENOENT) 5796 break; 5797 5798 inode = btrfs_iget_logging(parent, root); 5799 if (IS_ERR(inode)) { 5800 ret = PTR_ERR(inode); 5801 break; 5802 } 5803 5804 /* 5805 * Always log the directory, we cannot make this 5806 * conditional on need_log_inode() because the directory 5807 * might have been logged in LOG_INODE_EXISTS mode or 5808 * the dir index of the conflicting inode is not in a 5809 * dir index key range logged for the directory. So we 5810 * must make sure the deletion is recorded. 5811 */ 5812 ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); 5813 btrfs_add_delayed_iput(inode); 5814 if (ret) 5815 break; 5816 continue; 5817 } 5818 5819 /* 5820 * Here we can use need_log_inode() because we only need to log 5821 * the inode in LOG_INODE_EXISTS mode and rename operations 5822 * update the log, so that the log ends up with the new name and 5823 * without the old name. 5824 * 5825 * We did this check at add_conflicting_inode(), but here we do 5826 * it again because if some other task logged the inode after 5827 * that, we can avoid doing it again. 5828 */ 5829 if (!need_log_inode(trans, inode)) { 5830 btrfs_add_delayed_iput(inode); 5831 continue; 5832 } 5833 5834 /* 5835 * We are safe logging the other inode without acquiring its 5836 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5837 * are safe against concurrent renames of the other inode as 5838 * well because during a rename we pin the log and update the 5839 * log with the new name before we unpin it. 
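* (This is done by btrfs_log_new_name(), which pins the log by joining the running log transaction before it updates the log with the new name.)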
5840 */ 5841 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); 5842 btrfs_add_delayed_iput(inode); 5843 if (ret) 5844 break; 5845 } 5846 5847 ctx->logging_conflict_inodes = false; 5848 if (ret) 5849 free_conflicting_inodes(ctx); 5850 5851 return ret; 5852 } 5853 5854 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5855 struct btrfs_inode *inode, 5856 struct btrfs_key *min_key, 5857 const struct btrfs_key *max_key, 5858 struct btrfs_path *path, 5859 struct btrfs_path *dst_path, 5860 const u64 logged_isize, 5861 const int inode_only, 5862 struct btrfs_log_ctx *ctx, 5863 bool *need_log_inode_item) 5864 { 5865 const u64 i_size = i_size_read(&inode->vfs_inode); 5866 struct btrfs_root *root = inode->root; 5867 int ins_start_slot = 0; 5868 int ins_nr = 0; 5869 int ret; 5870 5871 while (1) { 5872 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5873 if (ret < 0) 5874 return ret; 5875 if (ret > 0) { 5876 ret = 0; 5877 break; 5878 } 5879 again: 5880 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5881 if (min_key->objectid != max_key->objectid) 5882 break; 5883 if (min_key->type > max_key->type) 5884 break; 5885 5886 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5887 *need_log_inode_item = false; 5888 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5889 min_key->offset >= i_size) { 5890 /* 5891 * Extents at and beyond eof are logged with 5892 * btrfs_log_prealloc_extents(). 5893 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5894 * and no keys greater than that, so bail out. 5895 */ 5896 break; 5897 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5898 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5899 (inode->generation == trans->transid || 5900 ctx->logging_conflict_inodes)) { 5901 u64 other_ino = 0; 5902 u64 other_parent = 0; 5903 5904 ret = btrfs_check_ref_name_override(path->nodes[0], 5905 path->slots[0], min_key, inode, 5906 &other_ino, &other_parent); 5907 if (ret < 0) { 5908 return ret; 5909 } else if (ret > 0 && 5910 other_ino != btrfs_ino(ctx->inode)) { 5911 if (ins_nr > 0) { 5912 ins_nr++; 5913 } else { 5914 ins_nr = 1; 5915 ins_start_slot = path->slots[0]; 5916 } 5917 ret = copy_items(trans, inode, dst_path, path, 5918 ins_start_slot, ins_nr, 5919 inode_only, logged_isize, ctx); 5920 if (ret < 0) 5921 return ret; 5922 ins_nr = 0; 5923 5924 btrfs_release_path(path); 5925 ret = add_conflicting_inode(trans, root, path, 5926 other_ino, 5927 other_parent, ctx); 5928 if (ret) 5929 return ret; 5930 goto next_key; 5931 } 5932 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5933 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5934 if (ins_nr == 0) 5935 goto next_slot; 5936 ret = copy_items(trans, inode, dst_path, path, 5937 ins_start_slot, 5938 ins_nr, inode_only, logged_isize, ctx); 5939 if (ret < 0) 5940 return ret; 5941 ins_nr = 0; 5942 goto next_slot; 5943 } 5944 5945 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5946 ins_nr++; 5947 goto next_slot; 5948 } else if (!ins_nr) { 5949 ins_start_slot = path->slots[0]; 5950 ins_nr = 1; 5951 goto next_slot; 5952 } 5953 5954 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5955 ins_nr, inode_only, logged_isize, ctx); 5956 if (ret < 0) 5957 return ret; 5958 ins_nr = 1; 5959 ins_start_slot = path->slots[0]; 5960 next_slot: 5961 path->slots[0]++; 5962 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5963 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5964 path->slots[0]); 5965 goto again; 5966 } 5967 if (ins_nr) { 5968 ret = 
copy_items(trans, inode, dst_path, path, 5969 ins_start_slot, ins_nr, inode_only, 5970 logged_isize, ctx); 5971 if (ret < 0) 5972 return ret; 5973 ins_nr = 0; 5974 } 5975 btrfs_release_path(path); 5976 next_key: 5977 if (min_key->offset < (u64)-1) { 5978 min_key->offset++; 5979 } else if (min_key->type < max_key->type) { 5980 min_key->type++; 5981 min_key->offset = 0; 5982 } else { 5983 break; 5984 } 5985 5986 /* 5987 * We may process many leaves full of items for our inode, so 5988 * avoid monopolizing a cpu for too long by rescheduling while 5989 * not holding locks on any tree. 5990 */ 5991 cond_resched(); 5992 } 5993 if (ins_nr) { 5994 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5995 ins_nr, inode_only, logged_isize, ctx); 5996 if (ret) 5997 return ret; 5998 } 5999 6000 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 6001 /* 6002 * Release the path because otherwise we might attempt to double 6003 * lock the same leaf with btrfs_log_prealloc_extents() below. 6004 */ 6005 btrfs_release_path(path); 6006 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx); 6007 } 6008 6009 return ret; 6010 } 6011 6012 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, 6013 struct btrfs_root *log, 6014 struct btrfs_path *path, 6015 const struct btrfs_item_batch *batch, 6016 const struct btrfs_delayed_item *first_item) 6017 { 6018 const struct btrfs_delayed_item *curr = first_item; 6019 int ret; 6020 6021 ret = btrfs_insert_empty_items(trans, log, path, batch); 6022 if (ret) 6023 return ret; 6024 6025 for (int i = 0; i < batch->nr; i++) { 6026 char *data_ptr; 6027 6028 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 6029 write_extent_buffer(path->nodes[0], &curr->data, 6030 (unsigned long)data_ptr, curr->data_len); 6031 curr = list_next_entry(curr, log_list); 6032 path->slots[0]++; 6033 } 6034 6035 btrfs_release_path(path); 6036 6037 return 0; 6038 } 6039 6040 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, 6041 struct btrfs_inode *inode, 6042 struct btrfs_path *path, 6043 const struct list_head *delayed_ins_list, 6044 struct btrfs_log_ctx *ctx) 6045 { 6046 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ 6047 const int max_batch_size = 195; 6048 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); 6049 const u64 ino = btrfs_ino(inode); 6050 struct btrfs_root *log = inode->root->log_root; 6051 struct btrfs_item_batch batch = { 6052 .nr = 0, 6053 .total_data_size = 0, 6054 }; 6055 const struct btrfs_delayed_item *first = NULL; 6056 const struct btrfs_delayed_item *curr; 6057 char *ins_data; 6058 struct btrfs_key *ins_keys; 6059 u32 *ins_sizes; 6060 u64 curr_batch_size = 0; 6061 int batch_idx = 0; 6062 int ret; 6063 6064 /* We are adding dir index items to the log tree. */ 6065 lockdep_assert_held(&inode->log_mutex); 6066 6067 /* 6068 * We collect delayed items before copying index keys from the subvolume 6069 * to the log tree. However just after we collected them, they may have 6070 * been flushed (all of them or just some of them), and therefore we 6071 * could have copied them from the subvolume tree to the log tree. 6072 * So find the first delayed item that was not yet logged (they are 6073 * sorted by index number). 6074 */ 6075 list_for_each_entry(curr, delayed_ins_list, log_list) { 6076 if (curr->index > inode->last_dir_index_offset) { 6077 first = curr; 6078 break; 6079 } 6080 } 6081 6082 /* Empty list or all delayed items were already logged. 
*/ 6083 if (!first) 6084 return 0; 6085 6086 ins_data = kmalloc(max_batch_size * sizeof(u32) + 6087 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); 6088 if (!ins_data) 6089 return -ENOMEM; 6090 ins_sizes = (u32 *)ins_data; 6091 batch.data_sizes = ins_sizes; 6092 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); 6093 batch.keys = ins_keys; 6094 6095 curr = first; 6096 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { 6097 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); 6098 6099 if (curr_batch_size + curr_size > leaf_data_size || 6100 batch.nr == max_batch_size) { 6101 ret = insert_delayed_items_batch(trans, log, path, 6102 &batch, first); 6103 if (ret) 6104 goto out; 6105 batch_idx = 0; 6106 batch.nr = 0; 6107 batch.total_data_size = 0; 6108 curr_batch_size = 0; 6109 first = curr; 6110 } 6111 6112 ins_sizes[batch_idx] = curr->data_len; 6113 ins_keys[batch_idx].objectid = ino; 6114 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; 6115 ins_keys[batch_idx].offset = curr->index; 6116 curr_batch_size += curr_size; 6117 batch.total_data_size += curr->data_len; 6118 batch.nr++; 6119 batch_idx++; 6120 curr = list_next_entry(curr, log_list); 6121 } 6122 6123 ASSERT(batch.nr >= 1); 6124 ret = insert_delayed_items_batch(trans, log, path, &batch, first); 6125 6126 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, 6127 log_list); 6128 inode->last_dir_index_offset = curr->index; 6129 out: 6130 kfree(ins_data); 6131 6132 return ret; 6133 } 6134 6135 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, 6136 struct btrfs_inode *inode, 6137 struct btrfs_path *path, 6138 const struct list_head *delayed_del_list, 6139 struct btrfs_log_ctx *ctx) 6140 { 6141 const u64 ino = btrfs_ino(inode); 6142 const struct btrfs_delayed_item *curr; 6143 6144 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6145 log_list); 6146 6147 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6148 u64 first_dir_index = curr->index; 6149 u64 last_dir_index; 6150 const struct btrfs_delayed_item *next; 6151 int ret; 6152 6153 /* 6154 * Find a range of consecutive dir index items to delete. Like 6155 * this we log a single dir range item spanning several contiguous 6156 * dir items instead of logging one range item per dir index item. 
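* For example, if the delayed deletion items cover indexes 10, 11, 12 and 15, we insert one range item for [10, 12] and another for [15, 15], instead of one item per index.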
6157 */ 6158 next = list_next_entry(curr, log_list); 6159 while (!list_entry_is_head(next, delayed_del_list, log_list)) { 6160 if (next->index != curr->index + 1) 6161 break; 6162 curr = next; 6163 next = list_next_entry(next, log_list); 6164 } 6165 6166 last_dir_index = curr->index; 6167 ASSERT(last_dir_index >= first_dir_index); 6168 6169 ret = insert_dir_log_key(trans, inode->root->log_root, path, 6170 ino, first_dir_index, last_dir_index); 6171 if (ret) 6172 return ret; 6173 curr = list_next_entry(curr, log_list); 6174 } 6175 6176 return 0; 6177 } 6178 6179 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, 6180 struct btrfs_inode *inode, 6181 struct btrfs_path *path, 6182 const struct list_head *delayed_del_list, 6183 const struct btrfs_delayed_item *first, 6184 const struct btrfs_delayed_item **last_ret) 6185 { 6186 const struct btrfs_delayed_item *next; 6187 struct extent_buffer *leaf = path->nodes[0]; 6188 const int last_slot = btrfs_header_nritems(leaf) - 1; 6189 int slot = path->slots[0] + 1; 6190 const u64 ino = btrfs_ino(inode); 6191 6192 next = list_next_entry(first, log_list); 6193 6194 while (slot < last_slot && 6195 !list_entry_is_head(next, delayed_del_list, log_list)) { 6196 struct btrfs_key key; 6197 6198 btrfs_item_key_to_cpu(leaf, &key, slot); 6199 if (key.objectid != ino || 6200 key.type != BTRFS_DIR_INDEX_KEY || 6201 key.offset != next->index) 6202 break; 6203 6204 slot++; 6205 *last_ret = next; 6206 next = list_next_entry(next, log_list); 6207 } 6208 6209 return btrfs_del_items(trans, inode->root->log_root, path, 6210 path->slots[0], slot - path->slots[0]); 6211 } 6212 6213 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, 6214 struct btrfs_inode *inode, 6215 struct btrfs_path *path, 6216 const struct list_head *delayed_del_list, 6217 struct btrfs_log_ctx *ctx) 6218 { 6219 struct btrfs_root *log = inode->root->log_root; 6220 const struct btrfs_delayed_item *curr; 6221 u64 last_range_start = 0; 6222 u64 last_range_end = 0; 6223 struct btrfs_key key; 6224 6225 key.objectid = btrfs_ino(inode); 6226 key.type = BTRFS_DIR_INDEX_KEY; 6227 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6228 log_list); 6229 6230 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6231 const struct btrfs_delayed_item *last = curr; 6232 u64 first_dir_index = curr->index; 6233 u64 last_dir_index; 6234 bool deleted_items = false; 6235 int ret; 6236 6237 key.offset = curr->index; 6238 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 6239 if (ret < 0) { 6240 return ret; 6241 } else if (ret == 0) { 6242 ret = batch_delete_dir_index_items(trans, inode, path, 6243 delayed_del_list, curr, 6244 &last); 6245 if (ret) 6246 return ret; 6247 deleted_items = true; 6248 } 6249 6250 btrfs_release_path(path); 6251 6252 /* 6253 * If we deleted items from the leaf, it means we have a range 6254 * item logging their range, so no need to add one or update an 6255 * existing one. Otherwise we have to log a dir range item. 6256 */ 6257 if (deleted_items) 6258 goto next_batch; 6259 6260 last_dir_index = last->index; 6261 ASSERT(last_dir_index >= first_dir_index); 6262 /* 6263 * If this range starts right after where the previous one ends, 6264 * then we want to reuse the previous range item and change its 6265 * end offset to the end of this range. This is just to minimize 6266 * leaf space usage, by avoiding adding a new range item. 
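* For example, if the previous range item covered indexes [30, 39] and the current batch covers [40, 45], we log a single range item for [30, 45] instead of adding a second item for [40, 45].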
6267 */ 6268 if (last_range_end != 0 && first_dir_index == last_range_end + 1) 6269 first_dir_index = last_range_start; 6270 6271 ret = insert_dir_log_key(trans, log, path, key.objectid, 6272 first_dir_index, last_dir_index); 6273 if (ret) 6274 return ret; 6275 6276 last_range_start = first_dir_index; 6277 last_range_end = last_dir_index; 6278 next_batch: 6279 curr = list_next_entry(last, log_list); 6280 } 6281 6282 return 0; 6283 } 6284 6285 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, 6286 struct btrfs_inode *inode, 6287 struct btrfs_path *path, 6288 const struct list_head *delayed_del_list, 6289 struct btrfs_log_ctx *ctx) 6290 { 6291 /* 6292 * We are deleting dir index items from the log tree or adding range 6293 * items to it. 6294 */ 6295 lockdep_assert_held(&inode->log_mutex); 6296 6297 if (list_empty(delayed_del_list)) 6298 return 0; 6299 6300 if (ctx->logged_before) 6301 return log_delayed_deletions_incremental(trans, inode, path, 6302 delayed_del_list, ctx); 6303 6304 return log_delayed_deletions_full(trans, inode, path, delayed_del_list, 6305 ctx); 6306 } 6307 6308 /* 6309 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed 6310 * items instead of the subvolume tree. 6311 */ 6312 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, 6313 struct btrfs_inode *inode, 6314 const struct list_head *delayed_ins_list, 6315 struct btrfs_log_ctx *ctx) 6316 { 6317 const bool orig_log_new_dentries = ctx->log_new_dentries; 6318 struct btrfs_delayed_item *item; 6319 int ret = 0; 6320 6321 /* 6322 * No need for the log mutex, plus to avoid potential deadlocks or 6323 * lockdep annotations due to nesting of delayed inode mutexes and log 6324 * mutexes. 6325 */ 6326 lockdep_assert_not_held(&inode->log_mutex); 6327 6328 ASSERT(!ctx->logging_new_delayed_dentries); 6329 ctx->logging_new_delayed_dentries = true; 6330 6331 list_for_each_entry(item, delayed_ins_list, log_list) { 6332 struct btrfs_dir_item *dir_item; 6333 struct btrfs_inode *di_inode; 6334 struct btrfs_key key; 6335 int log_mode = LOG_INODE_EXISTS; 6336 6337 dir_item = (struct btrfs_dir_item *)item->data; 6338 btrfs_disk_key_to_cpu(&key, &dir_item->location); 6339 6340 if (key.type == BTRFS_ROOT_ITEM_KEY) 6341 continue; 6342 6343 di_inode = btrfs_iget_logging(key.objectid, inode->root); 6344 if (IS_ERR(di_inode)) { 6345 ret = PTR_ERR(di_inode); 6346 break; 6347 } 6348 6349 if (!need_log_inode(trans, di_inode)) { 6350 btrfs_add_delayed_iput(di_inode); 6351 continue; 6352 } 6353 6354 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) 6355 log_mode = LOG_INODE_ALL; 6356 6357 ctx->log_new_dentries = false; 6358 ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); 6359 6360 if (!ret && ctx->log_new_dentries) 6361 ret = log_new_dir_dentries(trans, di_inode, ctx); 6362 6363 btrfs_add_delayed_iput(di_inode); 6364 6365 if (ret) 6366 break; 6367 } 6368 6369 ctx->log_new_dentries = orig_log_new_dentries; 6370 ctx->logging_new_delayed_dentries = false; 6371 6372 return ret; 6373 } 6374 6375 /* log a single inode in the tree log. 6376 * At least one parent directory for this inode must exist in the tree 6377 * or be logged already. 6378 * 6379 * Any items from this inode changed by the current transaction are copied 6380 * to the log tree. An extra reference is taken on any extents in this 6381 * file, allowing us to avoid a whole pile of corner cases around logging 6382 * blocks that have been removed from the tree. 
6383 * 6384 * See LOG_INODE_ALL and related defines for a description of what inode_only 6385 * does. 6386 * 6387 * This handles both files and directories. 6388 */ 6389 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 6390 struct btrfs_inode *inode, 6391 int inode_only, 6392 struct btrfs_log_ctx *ctx) 6393 { 6394 struct btrfs_path *path; 6395 struct btrfs_path *dst_path; 6396 struct btrfs_key min_key; 6397 struct btrfs_key max_key; 6398 struct btrfs_root *log = inode->root->log_root; 6399 int ret; 6400 bool fast_search = false; 6401 u64 ino = btrfs_ino(inode); 6402 struct extent_map_tree *em_tree = &inode->extent_tree; 6403 u64 logged_isize = 0; 6404 bool need_log_inode_item = true; 6405 bool xattrs_logged = false; 6406 bool inode_item_dropped = true; 6407 bool full_dir_logging = false; 6408 LIST_HEAD(delayed_ins_list); 6409 LIST_HEAD(delayed_del_list); 6410 6411 path = btrfs_alloc_path(); 6412 if (!path) 6413 return -ENOMEM; 6414 dst_path = btrfs_alloc_path(); 6415 if (!dst_path) { 6416 btrfs_free_path(path); 6417 return -ENOMEM; 6418 } 6419 6420 min_key.objectid = ino; 6421 min_key.type = BTRFS_INODE_ITEM_KEY; 6422 min_key.offset = 0; 6423 6424 max_key.objectid = ino; 6425 6426 6427 /* today the code can only do partial logging of directories */ 6428 if (S_ISDIR(inode->vfs_inode.i_mode) || 6429 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6430 &inode->runtime_flags) && 6431 inode_only >= LOG_INODE_EXISTS)) 6432 max_key.type = BTRFS_XATTR_ITEM_KEY; 6433 else 6434 max_key.type = (u8)-1; 6435 max_key.offset = (u64)-1; 6436 6437 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) 6438 full_dir_logging = true; 6439 6440 /* 6441 * If we are logging a directory while we are logging dentries of the 6442 * delayed items of some other inode, then we need to flush the delayed 6443 * items of this directory and not log the delayed items directly. This 6444 * is to prevent more than one level of recursion into btrfs_log_inode() 6445 * by having something like this: 6446 * 6447 * $ mkdir -p a/b/c/d/e/f/g/h/... 6448 * $ xfs_io -c "fsync" a 6449 * 6450 * Where all directories in the path did not exist before and are 6451 * created in the current transaction. 6452 * So in such a case we directly log the delayed items of the main 6453 * directory ("a") without flushing them first, while for each of its 6454 * subdirectories we flush their delayed items before logging them. 6455 * This prevents a potential unbounded recursion like this: 6456 * 6457 * btrfs_log_inode() 6458 * log_new_delayed_dentries() 6459 * btrfs_log_inode() 6460 * log_new_delayed_dentries() 6461 * btrfs_log_inode() 6462 * log_new_delayed_dentries() 6463 * (...) 6464 * 6465 * We have thresholds for the maximum number of delayed items to have in 6466 * memory, and once they are hit, the items are flushed asynchronously. 6467 * However the limit is quite high, so lets prevent deep levels of 6468 * recursion to happen by limiting the maximum depth to be 1. 6469 */ 6470 if (full_dir_logging && ctx->logging_new_delayed_dentries) { 6471 ret = btrfs_commit_inode_delayed_items(trans, inode); 6472 if (ret) 6473 goto out; 6474 } 6475 6476 mutex_lock(&inode->log_mutex); 6477 6478 /* 6479 * For symlinks, we must always log their content, which is stored in an 6480 * inline extent, otherwise we could end up with an empty symlink after 6481 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if 6482 * one attempts to create an empty symlink). 
We don't need to worry about flushing delalloc, because we create 6484 * the inline extent when the symlink is created (we never have delalloc 6485 * for symlinks). 6486 */ 6487 if (S_ISLNK(inode->vfs_inode.i_mode)) 6488 inode_only = LOG_INODE_ALL; 6489 6490 /* 6491 * Before logging the inode item, cache the value returned by 6492 * inode_logged(), because after that we need to figure out if 6493 * the inode was previously logged in this transaction. 6494 */ 6495 ret = inode_logged(trans, inode, path); 6496 if (ret < 0) 6497 goto out_unlock; 6498 ctx->logged_before = (ret == 1); 6499 ret = 0; 6500 6501 /* 6502 * This is for cases where logging a directory could result in losing a 6503 * file after replaying the log. For example, if we move a file from a 6504 * directory A to a directory B, then fsync directory A, we have no way 6505 * to know the file was moved from A to B, so logging just A would 6506 * result in losing the file after a log replay. 6507 */ 6508 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { 6509 ret = BTRFS_LOG_FORCE_COMMIT; 6510 goto out_unlock; 6511 } 6512 6513 /* 6514 * a brute force approach to making sure we get the most uptodate 6515 * copies of everything. 6516 */ 6517 if (S_ISDIR(inode->vfs_inode.i_mode)) { 6518 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 6519 if (ctx->logged_before) 6520 ret = drop_inode_items(trans, log, path, inode, 6521 BTRFS_XATTR_ITEM_KEY); 6522 } else { 6523 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 6524 /* 6525 * Make sure the new inode item we write to the log has 6526 * the same isize as the current one (if it exists). 6527 * This is necessary to prevent data loss after log 6528 * replay, and also to prevent doing a wrong expanding 6529 * truncate - for e.g. create file, write 4K into offset 6530 * 0, fsync, write 4K into offset 4096, add hard link, 6531 * fsync some other file (to sync log), power fail - if 6532 * we use the inode's current i_size, after log replay 6533 * we get an 8Kb file, with the last 4Kb extent as a hole 6534 * (zeroes), as if an expanding truncate happened, 6535 * instead of getting a file of 4Kb only.
6536 */ 6537 ret = logged_inode_size(log, inode, path, &logged_isize); 6538 if (ret) 6539 goto out_unlock; 6540 } 6541 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6542 &inode->runtime_flags)) { 6543 if (inode_only == LOG_INODE_EXISTS) { 6544 max_key.type = BTRFS_XATTR_ITEM_KEY; 6545 if (ctx->logged_before) 6546 ret = drop_inode_items(trans, log, path, 6547 inode, max_key.type); 6548 } else { 6549 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6550 &inode->runtime_flags); 6551 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6552 &inode->runtime_flags); 6553 if (ctx->logged_before) 6554 ret = truncate_inode_items(trans, log, 6555 inode, 0, 0); 6556 } 6557 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6558 &inode->runtime_flags) || 6559 inode_only == LOG_INODE_EXISTS) { 6560 if (inode_only == LOG_INODE_ALL) 6561 fast_search = true; 6562 max_key.type = BTRFS_XATTR_ITEM_KEY; 6563 if (ctx->logged_before) 6564 ret = drop_inode_items(trans, log, path, inode, 6565 max_key.type); 6566 } else { 6567 if (inode_only == LOG_INODE_ALL) 6568 fast_search = true; 6569 inode_item_dropped = false; 6570 goto log_extents; 6571 } 6572 6573 } 6574 if (ret) 6575 goto out_unlock; 6576 6577 /* 6578 * If we are logging a directory in full mode, collect the delayed items 6579 * before iterating the subvolume tree, so that we don't miss any new 6580 * dir index items in case they get flushed while or right after we are 6581 * iterating the subvolume tree. 6582 */ 6583 if (full_dir_logging && !ctx->logging_new_delayed_dentries) 6584 btrfs_log_get_delayed_items(inode, &delayed_ins_list, 6585 &delayed_del_list); 6586 6587 /* 6588 * If we are fsyncing a file with 0 hard links, then commit the delayed 6589 * inode because the last inode ref (or extref) item may still be in the 6590 * subvolume tree and if we log it the file will still exist after a log 6591 * replay. So commit the delayed inode to delete that last ref and we 6592 * skip logging it. 6593 */ 6594 if (inode->vfs_inode.i_nlink == 0) { 6595 ret = btrfs_commit_inode_delayed_inode(inode); 6596 if (ret) 6597 goto out_unlock; 6598 } 6599 6600 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 6601 path, dst_path, logged_isize, 6602 inode_only, ctx, 6603 &need_log_inode_item); 6604 if (ret) 6605 goto out_unlock; 6606 6607 btrfs_release_path(path); 6608 btrfs_release_path(dst_path); 6609 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); 6610 if (ret) 6611 goto out_unlock; 6612 xattrs_logged = true; 6613 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 6614 btrfs_release_path(path); 6615 btrfs_release_path(dst_path); 6616 ret = btrfs_log_holes(trans, inode, path); 6617 if (ret) 6618 goto out_unlock; 6619 } 6620 log_extents: 6621 btrfs_release_path(path); 6622 btrfs_release_path(dst_path); 6623 if (need_log_inode_item) { 6624 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 6625 if (ret) 6626 goto out_unlock; 6627 /* 6628 * If we are doing a fast fsync and the inode was logged before 6629 * in this transaction, we don't need to log the xattrs because 6630 * they were logged before. If xattrs were added, changed or 6631 * deleted since the last time we logged the inode, then we have 6632 * already logged them because the inode had the runtime flag 6633 * BTRFS_INODE_COPY_EVERYTHING set. 
6634 */ 6635 if (!xattrs_logged && inode->logged_trans < trans->transid) { 6636 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx); 6637 if (ret) 6638 goto out_unlock; 6639 btrfs_release_path(path); 6640 } 6641 } 6642 if (fast_search) { 6643 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 6644 if (ret) 6645 goto out_unlock; 6646 } else if (inode_only == LOG_INODE_ALL) { 6647 struct extent_map *em, *n; 6648 6649 write_lock(&em_tree->lock); 6650 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 6651 list_del_init(&em->list); 6652 write_unlock(&em_tree->lock); 6653 } 6654 6655 if (full_dir_logging) { 6656 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 6657 if (ret) 6658 goto out_unlock; 6659 ret = log_delayed_insertion_items(trans, inode, path, 6660 &delayed_ins_list, ctx); 6661 if (ret) 6662 goto out_unlock; 6663 ret = log_delayed_deletion_items(trans, inode, path, 6664 &delayed_del_list, ctx); 6665 if (ret) 6666 goto out_unlock; 6667 } 6668 6669 spin_lock(&inode->lock); 6670 inode->logged_trans = trans->transid; 6671 /* 6672 * Don't update last_log_commit if we logged that an inode exists. 6673 * We do this for three reasons: 6674 * 6675 * 1) We might have had buffered writes to this inode that were 6676 * flushed and had their ordered extents completed in this 6677 * transaction, but we did not previously log the inode with 6678 * LOG_INODE_ALL. Later the inode was evicted and after that 6679 * it was loaded again and this LOG_INODE_EXISTS log operation 6680 * happened. We must make sure that if an explicit fsync against 6681 * the inode is performed later, it logs the new extents, an 6682 * updated inode item, etc, and syncs the log. The same logic 6683 * applies to direct IO writes instead of buffered writes. 6684 * 6685 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6686 * is logged with an i_size of 0 or whatever value was logged 6687 * before. If later the i_size of the inode is increased by a 6688 * truncate operation, the log is synced through an fsync of 6689 * some other inode and then finally an explicit fsync against 6690 * this inode is made, we must make sure this fsync logs the 6691 * inode with the new i_size, the hole between old i_size and 6692 * the new i_size, and syncs the log. 6693 * 6694 * 3) If we are logging that an ancestor inode exists as part of 6695 * logging a new name from a link or rename operation, don't update 6696 * its last_log_commit - otherwise if an explicit fsync is made 6697 * against an ancestor, the fsync considers the inode in the log 6698 * and doesn't sync the log, resulting in the ancestor missing after 6699 * a power failure unless the log was synced as part of an fsync 6700 * against any other unrelated inode. 6701 */ 6702 if (inode_only != LOG_INODE_EXISTS) 6703 inode->last_log_commit = inode->last_sub_trans; 6704 spin_unlock(&inode->lock); 6705 6706 /* 6707 * Reset the last_reflink_trans so that the next fsync does not need to 6708 * go through the slower path when logging extents and their checksums. 
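* (last_reflink_trans records the transaction of the last clone/dedupe operation that shared extents into this inode.)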
6709 */ 6710 if (inode_only == LOG_INODE_ALL) 6711 inode->last_reflink_trans = 0; 6712 6713 out_unlock: 6714 mutex_unlock(&inode->log_mutex); 6715 out: 6716 btrfs_free_path(path); 6717 btrfs_free_path(dst_path); 6718 6719 if (ret) 6720 free_conflicting_inodes(ctx); 6721 else 6722 ret = log_conflicting_inodes(trans, inode->root, ctx); 6723 6724 if (full_dir_logging && !ctx->logging_new_delayed_dentries) { 6725 if (!ret) 6726 ret = log_new_delayed_dentries(trans, inode, 6727 &delayed_ins_list, ctx); 6728 6729 btrfs_log_put_delayed_items(inode, &delayed_ins_list, 6730 &delayed_del_list); 6731 } 6732 6733 return ret; 6734 } 6735 6736 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6737 struct btrfs_inode *inode, 6738 struct btrfs_log_ctx *ctx) 6739 { 6740 int ret; 6741 struct btrfs_path *path; 6742 struct btrfs_key key; 6743 struct btrfs_root *root = inode->root; 6744 const u64 ino = btrfs_ino(inode); 6745 6746 path = btrfs_alloc_path(); 6747 if (!path) 6748 return -ENOMEM; 6749 path->skip_locking = 1; 6750 path->search_commit_root = 1; 6751 6752 key.objectid = ino; 6753 key.type = BTRFS_INODE_REF_KEY; 6754 key.offset = 0; 6755 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6756 if (ret < 0) 6757 goto out; 6758 6759 while (true) { 6760 struct extent_buffer *leaf = path->nodes[0]; 6761 int slot = path->slots[0]; 6762 u32 cur_offset = 0; 6763 u32 item_size; 6764 unsigned long ptr; 6765 6766 if (slot >= btrfs_header_nritems(leaf)) { 6767 ret = btrfs_next_leaf(root, path); 6768 if (ret < 0) 6769 goto out; 6770 else if (ret > 0) 6771 break; 6772 continue; 6773 } 6774 6775 btrfs_item_key_to_cpu(leaf, &key, slot); 6776 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6777 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6778 break; 6779 6780 item_size = btrfs_item_size(leaf, slot); 6781 ptr = btrfs_item_ptr_offset(leaf, slot); 6782 while (cur_offset < item_size) { 6783 struct btrfs_key inode_key; 6784 struct btrfs_inode *dir_inode; 6785 6786 inode_key.type = BTRFS_INODE_ITEM_KEY; 6787 inode_key.offset = 0; 6788 6789 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6790 struct btrfs_inode_extref *extref; 6791 6792 extref = (struct btrfs_inode_extref *) 6793 (ptr + cur_offset); 6794 inode_key.objectid = btrfs_inode_extref_parent( 6795 leaf, extref); 6796 cur_offset += sizeof(*extref); 6797 cur_offset += btrfs_inode_extref_name_len(leaf, 6798 extref); 6799 } else { 6800 inode_key.objectid = key.offset; 6801 cur_offset = item_size; 6802 } 6803 6804 dir_inode = btrfs_iget_logging(inode_key.objectid, root); 6805 /* 6806 * If the parent inode was deleted, return an error to 6807 * fallback to a transaction commit. This is to prevent 6808 * getting an inode that was moved from one parent A to 6809 * a parent B, got its former parent A deleted and then 6810 * it got fsync'ed, from existing at both parents after 6811 * a log replay (and the old parent still existing). 6812 * Example: 6813 * 6814 * mkdir /mnt/A 6815 * mkdir /mnt/B 6816 * touch /mnt/B/bar 6817 * sync 6818 * mv /mnt/B/bar /mnt/A/bar 6819 * mv -T /mnt/A /mnt/B 6820 * fsync /mnt/B/bar 6821 * <power fail> 6822 * 6823 * If we ignore the old parent B which got deleted, 6824 * after a log replay we would have file bar linked 6825 * at both parents and the old parent B would still 6826 * exist. 
6827 */ 6828 if (IS_ERR(dir_inode)) { 6829 ret = PTR_ERR(dir_inode); 6830 goto out; 6831 } 6832 6833 if (!need_log_inode(trans, dir_inode)) { 6834 btrfs_add_delayed_iput(dir_inode); 6835 continue; 6836 } 6837 6838 ctx->log_new_dentries = false; 6839 ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); 6840 if (!ret && ctx->log_new_dentries) 6841 ret = log_new_dir_dentries(trans, dir_inode, ctx); 6842 btrfs_add_delayed_iput(dir_inode); 6843 if (ret) 6844 goto out; 6845 } 6846 path->slots[0]++; 6847 } 6848 ret = 0; 6849 out: 6850 btrfs_free_path(path); 6851 return ret; 6852 } 6853 6854 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6855 struct btrfs_root *root, 6856 struct btrfs_path *path, 6857 struct btrfs_log_ctx *ctx) 6858 { 6859 struct btrfs_key found_key; 6860 6861 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6862 6863 while (true) { 6864 struct extent_buffer *leaf; 6865 int slot; 6866 struct btrfs_key search_key; 6867 struct btrfs_inode *inode; 6868 u64 ino; 6869 int ret = 0; 6870 6871 btrfs_release_path(path); 6872 6873 ino = found_key.offset; 6874 6875 search_key.objectid = found_key.offset; 6876 search_key.type = BTRFS_INODE_ITEM_KEY; 6877 search_key.offset = 0; 6878 inode = btrfs_iget_logging(ino, root); 6879 if (IS_ERR(inode)) 6880 return PTR_ERR(inode); 6881 6882 if (inode->generation >= trans->transid && 6883 need_log_inode(trans, inode)) 6884 ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); 6885 btrfs_add_delayed_iput(inode); 6886 if (ret) 6887 return ret; 6888 6889 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6890 break; 6891 6892 search_key.type = BTRFS_INODE_REF_KEY; 6893 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6894 if (ret < 0) 6895 return ret; 6896 6897 leaf = path->nodes[0]; 6898 slot = path->slots[0]; 6899 if (slot >= btrfs_header_nritems(leaf)) { 6900 ret = btrfs_next_leaf(root, path); 6901 if (ret < 0) 6902 return ret; 6903 else if (ret > 0) 6904 return -ENOENT; 6905 leaf = path->nodes[0]; 6906 slot = path->slots[0]; 6907 } 6908 6909 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6910 if (found_key.objectid != search_key.objectid || 6911 found_key.type != BTRFS_INODE_REF_KEY) 6912 return -ENOENT; 6913 } 6914 return 0; 6915 } 6916 6917 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6918 struct btrfs_inode *inode, 6919 struct dentry *parent, 6920 struct btrfs_log_ctx *ctx) 6921 { 6922 struct btrfs_root *root = inode->root; 6923 struct dentry *old_parent = NULL; 6924 struct super_block *sb = inode->vfs_inode.i_sb; 6925 int ret = 0; 6926 6927 while (true) { 6928 if (!parent || d_really_is_negative(parent) || 6929 sb != parent->d_sb) 6930 break; 6931 6932 inode = BTRFS_I(d_inode(parent)); 6933 if (root != inode->root) 6934 break; 6935 6936 if (inode->generation >= trans->transid && 6937 need_log_inode(trans, inode)) { 6938 ret = btrfs_log_inode(trans, inode, 6939 LOG_INODE_EXISTS, ctx); 6940 if (ret) 6941 break; 6942 } 6943 if (IS_ROOT(parent)) 6944 break; 6945 6946 parent = dget_parent(parent); 6947 dput(old_parent); 6948 old_parent = parent; 6949 } 6950 dput(old_parent); 6951 6952 return ret; 6953 } 6954 6955 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6956 struct btrfs_inode *inode, 6957 struct dentry *parent, 6958 struct btrfs_log_ctx *ctx) 6959 { 6960 struct btrfs_root *root = inode->root; 6961 const u64 ino = btrfs_ino(inode); 6962 struct btrfs_path *path; 6963 struct btrfs_key search_key; 6964 int ret; 6965 6966 /* 6967 * For a single 
hard link case, go through a fast path that does not 6968 * need to iterate the fs/subvolume tree. 6969 */ 6970 if (inode->vfs_inode.i_nlink < 2) 6971 return log_new_ancestors_fast(trans, inode, parent, ctx); 6972 6973 path = btrfs_alloc_path(); 6974 if (!path) 6975 return -ENOMEM; 6976 6977 search_key.objectid = ino; 6978 search_key.type = BTRFS_INODE_REF_KEY; 6979 search_key.offset = 0; 6980 again: 6981 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6982 if (ret < 0) 6983 goto out; 6984 if (ret == 0) 6985 path->slots[0]++; 6986 6987 while (true) { 6988 struct extent_buffer *leaf = path->nodes[0]; 6989 int slot = path->slots[0]; 6990 struct btrfs_key found_key; 6991 6992 if (slot >= btrfs_header_nritems(leaf)) { 6993 ret = btrfs_next_leaf(root, path); 6994 if (ret < 0) 6995 goto out; 6996 else if (ret > 0) 6997 break; 6998 continue; 6999 } 7000 7001 btrfs_item_key_to_cpu(leaf, &found_key, slot); 7002 if (found_key.objectid != ino || 7003 found_key.type > BTRFS_INODE_EXTREF_KEY) 7004 break; 7005 7006 /* 7007 * Don't deal with extended references because they are rare 7008 * cases and too complex to deal with (we would need to keep 7009 * track of which subitem we are processing for each item in 7010 * this loop, etc). So just return some error to fallback to 7011 * a transaction commit. 7012 */ 7013 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 7014 ret = -EMLINK; 7015 goto out; 7016 } 7017 7018 /* 7019 * Logging ancestors needs to do more searches on the fs/subvol 7020 * tree, so it releases the path as needed to avoid deadlocks. 7021 * Keep track of the last inode ref key and resume from that key 7022 * after logging all new ancestors for the current hard link. 7023 */ 7024 memcpy(&search_key, &found_key, sizeof(search_key)); 7025 7026 ret = log_new_ancestors(trans, root, path, ctx); 7027 if (ret) 7028 goto out; 7029 btrfs_release_path(path); 7030 goto again; 7031 } 7032 ret = 0; 7033 out: 7034 btrfs_free_path(path); 7035 return ret; 7036 } 7037 7038 /* 7039 * helper function around btrfs_log_inode to make sure newly created 7040 * parent directories also end up in the log. A minimal inode and backref 7041 * only logging is done of any parent directories that are older than 7042 * the last committed transaction 7043 */ 7044 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 7045 struct btrfs_inode *inode, 7046 struct dentry *parent, 7047 int inode_only, 7048 struct btrfs_log_ctx *ctx) 7049 { 7050 struct btrfs_root *root = inode->root; 7051 struct btrfs_fs_info *fs_info = root->fs_info; 7052 int ret = 0; 7053 bool log_dentries; 7054 7055 if (btrfs_test_opt(fs_info, NOTREELOG)) 7056 return BTRFS_LOG_FORCE_COMMIT; 7057 7058 if (btrfs_root_refs(&root->root_item) == 0) 7059 return BTRFS_LOG_FORCE_COMMIT; 7060 7061 /* 7062 * If we're logging an inode from a subvolume created in the current 7063 * transaction we must force a commit since the root is not persisted. 7064 */ 7065 if (btrfs_root_generation(&root->root_item) == trans->transid) 7066 return BTRFS_LOG_FORCE_COMMIT; 7067 7068 /* Skip already logged inodes and without new extents. 
*/ 7069 if (btrfs_inode_in_log(inode, trans->transid) && 7070 list_empty(&ctx->ordered_extents)) 7071 return BTRFS_NO_LOG_SYNC; 7072 7073 ret = start_log_trans(trans, root, ctx); 7074 if (ret) 7075 return ret; 7076 7077 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 7078 if (ret) 7079 goto end_trans; 7080 7081 /* 7082 * for regular files, if its inode is already on disk, we don't 7083 * have to worry about the parents at all. This is because 7084 * we can use the last_unlink_trans field to record renames 7085 * and other fun in this file. 7086 */ 7087 if (S_ISREG(inode->vfs_inode.i_mode) && 7088 inode->generation < trans->transid && 7089 inode->last_unlink_trans < trans->transid) { 7090 ret = 0; 7091 goto end_trans; 7092 } 7093 7094 /* 7095 * Track if we need to log dentries because ctx->log_new_dentries can 7096 * be modified in the call chains below. 7097 */ 7098 log_dentries = ctx->log_new_dentries; 7099 7100 /* 7101 * On unlink we must make sure all our current and old parent directory 7102 * inodes are fully logged. This is to prevent leaving dangling 7103 * directory index entries in directories that were our parents but are 7104 * not anymore. Not doing this results in old parent directory being 7105 * impossible to delete after log replay (rmdir will always fail with 7106 * error -ENOTEMPTY). 7107 * 7108 * Example 1: 7109 * 7110 * mkdir testdir 7111 * touch testdir/foo 7112 * ln testdir/foo testdir/bar 7113 * sync 7114 * unlink testdir/bar 7115 * xfs_io -c fsync testdir/foo 7116 * <power failure> 7117 * mount fs, triggers log replay 7118 * 7119 * If we don't log the parent directory (testdir), after log replay the 7120 * directory still has an entry pointing to the file inode using the bar 7121 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 7122 * the file inode has a link count of 1. 7123 * 7124 * Example 2: 7125 * 7126 * mkdir testdir 7127 * touch foo 7128 * ln foo testdir/foo2 7129 * ln foo testdir/foo3 7130 * sync 7131 * unlink testdir/foo3 7132 * xfs_io -c fsync foo 7133 * <power failure> 7134 * mount fs, triggers log replay 7135 * 7136 * Similar as the first example, after log replay the parent directory 7137 * testdir still has an entry pointing to the inode file with name foo3 7138 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 7139 * and has a link count of 2. 7140 */ 7141 if (inode->last_unlink_trans >= trans->transid) { 7142 ret = btrfs_log_all_parents(trans, inode, ctx); 7143 if (ret) 7144 goto end_trans; 7145 } 7146 7147 ret = log_all_new_ancestors(trans, inode, parent, ctx); 7148 if (ret) 7149 goto end_trans; 7150 7151 if (log_dentries) 7152 ret = log_new_dir_dentries(trans, inode, ctx); 7153 end_trans: 7154 if (ret < 0) { 7155 btrfs_set_log_full_commit(trans); 7156 ret = BTRFS_LOG_FORCE_COMMIT; 7157 } 7158 7159 if (ret) 7160 btrfs_remove_log_ctx(root, ctx); 7161 btrfs_end_log_trans(root); 7162 7163 return ret; 7164 } 7165 7166 /* 7167 * it is not safe to log dentry if the chunk root has added new 7168 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 7169 * If this returns 1, you must commit the transaction to safely get your 7170 * data on disk. 
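* (In practice the non-zero cases are BTRFS_LOG_FORCE_COMMIT and BTRFS_NO_LOG_SYNC, returned by btrfs_log_inode_parent(), rather than a literal 1.)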
7171 */ 7172 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 7173 struct dentry *dentry, 7174 struct btrfs_log_ctx *ctx) 7175 { 7176 struct dentry *parent = dget_parent(dentry); 7177 int ret; 7178 7179 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 7180 LOG_INODE_ALL, ctx); 7181 dput(parent); 7182 7183 return ret; 7184 } 7185 7186 /* 7187 * should be called during mount to recover and replay any log trees 7188 * from the FS 7189 */ 7190 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 7191 { 7192 int ret; 7193 struct btrfs_path *path; 7194 struct btrfs_trans_handle *trans; 7195 struct btrfs_key key; 7196 struct btrfs_key found_key; 7197 struct btrfs_root *log; 7198 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 7199 struct walk_control wc = { 7200 .process_func = process_one_buffer, 7201 .stage = LOG_WALK_PIN_ONLY, 7202 }; 7203 7204 path = btrfs_alloc_path(); 7205 if (!path) 7206 return -ENOMEM; 7207 7208 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7209 7210 trans = btrfs_start_transaction(fs_info->tree_root, 0); 7211 if (IS_ERR(trans)) { 7212 ret = PTR_ERR(trans); 7213 goto error; 7214 } 7215 7216 wc.trans = trans; 7217 wc.pin = 1; 7218 7219 ret = walk_log_tree(trans, log_root_tree, &wc); 7220 if (ret) { 7221 btrfs_abort_transaction(trans, ret); 7222 goto error; 7223 } 7224 7225 again: 7226 key.objectid = BTRFS_TREE_LOG_OBJECTID; 7227 key.type = BTRFS_ROOT_ITEM_KEY; 7228 key.offset = (u64)-1; 7229 7230 while (1) { 7231 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 7232 7233 if (ret < 0) { 7234 btrfs_abort_transaction(trans, ret); 7235 goto error; 7236 } 7237 if (ret > 0) { 7238 if (path->slots[0] == 0) 7239 break; 7240 path->slots[0]--; 7241 } 7242 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 7243 path->slots[0]); 7244 btrfs_release_path(path); 7245 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 7246 break; 7247 7248 log = btrfs_read_tree_root(log_root_tree, &found_key); 7249 if (IS_ERR(log)) { 7250 ret = PTR_ERR(log); 7251 btrfs_abort_transaction(trans, ret); 7252 goto error; 7253 } 7254 7255 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 7256 true); 7257 if (IS_ERR(wc.replay_dest)) { 7258 ret = PTR_ERR(wc.replay_dest); 7259 7260 /* 7261 * We didn't find the subvol, likely because it was 7262 * deleted. This is ok, simply skip this log and go to 7263 * the next one. 7264 * 7265 * We need to exclude the root because we can't have 7266 * other log replays overwriting this log as we'll read 7267 * it back in a few more times. This will keep our 7268 * block from being modified, and we'll just bail for 7269 * each subsequent pass.
7270 */ 7271 if (ret == -ENOENT) 7272 ret = btrfs_pin_extent_for_log_replay(trans, log->node); 7273 btrfs_put_root(log); 7274 7275 if (!ret) 7276 goto next; 7277 btrfs_abort_transaction(trans, ret); 7278 goto error; 7279 } 7280 7281 wc.replay_dest->log_root = log; 7282 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 7283 if (ret) 7284 /* The loop needs to continue due to the root refs */ 7285 btrfs_abort_transaction(trans, ret); 7286 else 7287 ret = walk_log_tree(trans, log, &wc); 7288 7289 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7290 ret = fixup_inode_link_counts(trans, wc.replay_dest, 7291 path); 7292 if (ret) 7293 btrfs_abort_transaction(trans, ret); 7294 } 7295 7296 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7297 struct btrfs_root *root = wc.replay_dest; 7298 7299 btrfs_release_path(path); 7300 7301 /* 7302 * We have just replayed everything, and the highest 7303 * objectid of fs roots probably has changed in case 7304 * some inode_item's got replayed. 7305 * 7306 * root->objectid_mutex is not acquired as log replay 7307 * could only happen during mount. 7308 */ 7309 ret = btrfs_init_root_free_objectid(root); 7310 if (ret) 7311 btrfs_abort_transaction(trans, ret); 7312 } 7313 7314 wc.replay_dest->log_root = NULL; 7315 btrfs_put_root(wc.replay_dest); 7316 btrfs_put_root(log); 7317 7318 if (ret) 7319 goto error; 7320 next: 7321 if (found_key.offset == 0) 7322 break; 7323 key.offset = found_key.offset - 1; 7324 } 7325 btrfs_release_path(path); 7326 7327 /* step one is to pin it all, step two is to replay just inodes */ 7328 if (wc.pin) { 7329 wc.pin = 0; 7330 wc.process_func = replay_one_buffer; 7331 wc.stage = LOG_WALK_REPLAY_INODES; 7332 goto again; 7333 } 7334 /* step three is to replay everything */ 7335 if (wc.stage < LOG_WALK_REPLAY_ALL) { 7336 wc.stage++; 7337 goto again; 7338 } 7339 7340 btrfs_free_path(path); 7341 7342 /* step 4: commit the transaction, which also unpins the blocks */ 7343 ret = btrfs_commit_transaction(trans); 7344 if (ret) 7345 return ret; 7346 7347 log_root_tree->log_root = NULL; 7348 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7349 btrfs_put_root(log_root_tree); 7350 7351 return 0; 7352 error: 7353 if (wc.trans) 7354 btrfs_end_transaction(wc.trans); 7355 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7356 btrfs_free_path(path); 7357 return ret; 7358 } 7359 7360 /* 7361 * there are some corner cases where we want to force a full 7362 * commit instead of allowing a directory to be logged. 7363 * 7364 * They revolve around files there were unlinked from the directory, and 7365 * this function updates the parent directory so that a full commit is 7366 * properly done if it is fsync'd later after the unlinks are done. 7367 * 7368 * Must be called before the unlink operations (updates to the subvolume tree, 7369 * inodes, etc) are done. 7370 */ 7371 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 7372 struct btrfs_inode *dir, struct btrfs_inode *inode, 7373 bool for_rename) 7374 { 7375 /* 7376 * when we're logging a file, if it hasn't been renamed 7377 * or unlinked, and its inode is fully committed on disk, 7378 * we don't have to worry about walking up the directory chain 7379 * to log its parents. 7380 * 7381 * So, we use the last_unlink_trans field to put this transid 7382 * into the file. When the file is logged we check it and 7383 * don't log the parents if the file is fully on disk. 
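* (btrfs_log_inode_parent() does that check: a regular file whose generation and last_unlink_trans both predate the current transaction does not need its parents logged.)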
/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     bool for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file. When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	mutex_lock(&inode->log_mutex);
	inode->last_unlink_trans = trans->transid;
	mutex_unlock(&inode->log_mutex);

	if (!for_rename)
		return;

	/*
	 * If this directory was already logged, any new names will be logged
	 * with btrfs_log_new_name() and old names will be deleted from the log
	 * tree with btrfs_del_dir_entries_in_log() or with
	 * btrfs_del_inode_ref_in_log().
	 */
	if (inode_logged(trans, dir, NULL) == 1)
		return;

	/*
	 * If the inode we're about to unlink was logged before, the log will be
	 * properly updated with the new name with btrfs_log_new_name() and the
	 * old name removed with btrfs_del_dir_entries_in_log() or with
	 * btrfs_del_inode_ref_in_log().
	 */
	if (inode_logged(trans, inode, NULL) == 1)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly. So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

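/*
 * A concrete instance of the case the conservative path above guards against
 * (example only, the names are made up):
 *
 *	mv dirA/file dirB/file		neither the inode nor dirA was logged
 *	fsync(dirA)			dirA's log no longer contains "file"
 *	<crash and log replay>
 *
 * Without last_unlink_trans forcing a full transaction commit on that fsync,
 * replay could remove the old name while the new name under dirB was never
 * persisted anywhere.
 */
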
/*
 * Make sure that if someone attempts to fsync the parent directory of a deleted
 * snapshot, it ends up triggering a transaction commit. This is to guarantee
 * that after replaying the log tree of the parent directory's root we will not
 * see the snapshot anymore and at log replay time we will not see any log tree
 * corresponding to the deleted snapshot's root, which could lead to replaying
 * it after replaying the log tree of the parent directory (which would replay
 * the snapshot delete operation).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				   struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

/*
 * Call this when creating a subvolume in a directory.
 * Because we don't commit a transaction when creating a subvolume, we can't
 * allow the directory pointing to the subvolume to be logged with an entry that
 * points to an unpersisted root if we are still in the transaction used to
 * create the subvolume, so make any attempt to log the directory result in a
 * full log sync.
 * Also we don't need to worry about renames, since btrfs_rename() marks the log
 * for a full commit when renaming a subvolume.
 */
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir)
{
	mutex_lock(&dir->log_mutex);
	dir->last_unlink_trans = trans->transid;
	mutex_unlock(&dir->log_mutex);
}

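/*
 * Note that btrfs_record_snapshot_destroy() and btrfs_record_new_subvolume()
 * reuse the mechanism of btrfs_record_unlink_dir(): each stamps the parent
 * directory's last_unlink_trans with the current transaction id, and the
 * logging code later compares that field against the running transaction to
 * decide whether a plain log sync is safe or a full transaction commit is
 * required.
 */
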
/*
 * Update the log after adding a new name for an inode.
 *
 * @trans:          Transaction handle.
 * @old_dentry:     The dentry associated with the old name and the old
 *                  parent directory.
 * @old_dir:        The inode of the previous parent directory for the case
 *                  of a rename. For a link operation, it must be NULL.
 * @old_dir_index:  The index number associated with the old name, meaningful
 *                  only for rename operations (when @old_dir is not NULL).
 *                  Ignored for link operations.
 * @parent:         The dentry associated with the directory under which the
 *                  new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
			struct dentry *old_dentry, struct btrfs_inode *old_dir,
			u64 old_dir_index, struct dentry *parent)
{
	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
	struct btrfs_root *root = inode->root;
	struct btrfs_log_ctx ctx;
	bool log_pinned = false;
	int ret;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (!S_ISDIR(inode->vfs_inode.i_mode))
		inode->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're renaming it
	 * from hasn't been logged, we don't need to log it
	 */
	ret = inode_logged(trans, inode, NULL);
	if (ret < 0) {
		goto out;
	} else if (ret == 0) {
		if (!old_dir)
			return;
		/*
		 * If the inode was not logged and we are doing a rename (old_dir is not
		 * NULL), check if old_dir was logged - if it was not we can return and
		 * do nothing.
		 */
		ret = inode_logged(trans, old_dir, NULL);
		if (ret < 0)
			goto out;
		else if (ret == 0)
			return;
	}
	ret = 0;

	/*
	 * If we are doing a rename (old_dir is not NULL) from a directory that
	 * was previously logged, make sure that on log replay we get the old
	 * dir entry deleted. This is needed because we will also log the new
	 * name of the renamed inode, so we need to make sure that after log
	 * replay we don't end up with both the new and old dir entries existing.
	 */
	if (old_dir && old_dir->logged_trans == trans->transid) {
		struct btrfs_root *log = old_dir->root->log_root;
		struct btrfs_path *path;
		struct fscrypt_name fname;

		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);

		ret = fscrypt_setup_filename(&old_dir->vfs_inode,
					     &old_dentry->d_name, 0, &fname);
		if (ret)
			goto out;
		/*
		 * We have two inodes to update in the log, the old directory and
		 * the inode that got renamed, so we must pin the log to prevent
		 * anyone from syncing the log until we have updated both inodes
		 * in the log.
		 */
		ret = join_running_log_trans(root);
		/*
		 * At least one of the inodes was logged before, so this should
		 * not fail, but if it does, it's not serious, just bail out and
		 * mark the log for a full commit.
		 */
		if (WARN_ON_ONCE(ret < 0)) {
			fscrypt_free_filename(&fname);
			goto out;
		}

		log_pinned = true;

		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			fscrypt_free_filename(&fname);
			goto out;
		}

		/*
		 * Another concurrent task might be logging the old directory,
		 * as that can be triggered when logging another inode that had
		 * or still has a dentry in the old directory. We lock the old
		 * directory's log_mutex to ensure the deletion of the old
		 * name is persisted, because during directory logging we
		 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
		 * the old name's dir index item is in the delayed items, so
		 * it could be missed by an in-progress directory logging.
		 */
		mutex_lock(&old_dir->log_mutex);
		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
					&fname.disk_name, old_dir_index);
		if (ret > 0) {
			/*
			 * The dentry does not exist in the log, so record its
			 * deletion.
			 */
			btrfs_release_path(path);
			ret = insert_dir_log_key(trans, log, path,
						 btrfs_ino(old_dir),
						 old_dir_index, old_dir_index);
		}
		mutex_unlock(&old_dir->log_mutex);

		btrfs_free_path(path);
		fscrypt_free_filename(&fname);
		if (ret < 0)
			goto out;
	}

	btrfs_init_log_ctx(&ctx, inode);
	ctx.logging_new_name = true;
	btrfs_init_log_ctx_scratch_eb(&ctx);
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a full
	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
	 * we don't need to worry about getting a log committed that has an
	 * inconsistent state after a rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
	free_extent_buffer(ctx.scratch_eb);
	ASSERT(list_empty(&ctx.conflict_inodes));
out:
	/*
	 * If an error happened mark the log for a full commit because it's not
	 * consistent and up to date or we couldn't find out if one of the
	 * inodes was logged before in this transaction. Do it before unpinning
	 * the log, to avoid any races with someone else trying to commit it.
	 */
	if (ret < 0)
		btrfs_set_log_full_commit(trans);
	if (log_pinned)
		btrfs_end_log_trans(root);
}

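/*
 * Illustrative only (not compiled): how the link and rename paths are expected
 * to use btrfs_log_new_name(), after the new name has been inserted in the
 * subvolume tree. The arguments follow the parameter documentation above; the
 * index variable name is a stand-in and the surrounding locking and error
 * handling are omitted.
 *
 *	// link: no old parent directory, no old dir index
 *	btrfs_log_new_name(trans, dentry, NULL, 0, parent);
 *
 *	// rename: pass the old parent directory and the old name's dir index
 *	btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_idx,
 *			   new_dentry->d_parent);
 */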