// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Alexander Block. All rights reserved.
 */

#include <linux/bsearch.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/sort.h>
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>
#include "send.h"
#include "ctree.h"
#include "backref.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
#include "print-tree.h"
#include "accessors.h"
#include "dir-item.h"
#include "file-item.h"
#include "ioctl.h"
#include "verity.h"
#include "lru_cache.h"

/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with a large number of references).
 */
#define SEND_MAX_EXTENT_REFS 1024

/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
	union {
		struct {
			char *start;
			char *end;

			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, so we'll have
		 * better packing in the slab and a higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))


/* reused for each extent */
struct clone_root {
	struct btrfs_root *root;
	u64 ino;
	u64 offset;
	u64 num_bytes;
	bool found_ref;
};

#define SEND_MAX_NAME_CACHE_SIZE 256

/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS 17

/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE 128

/*
 * A backref cache entry maps a leaf to a list of IDs of roots from which the
 * leaf is accessible and we can use for clone operations.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
 * x86_64).
 */
struct backref_cache_entry {
	struct btrfs_lru_cache_entry entry;
	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
	/* Number of valid elements in the root_ids array.
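	 * Never exceeds SEND_MAX_BACKREF_CACHE_ROOTS - if a leaf is reachable
	 * from more clone roots than that, the entry is simply not cached
	 * (see store_backref_cache()).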
	 */
	int num_roots;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct backref_cache_entry, entry) == 0);

/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64

/*
 * Max number of entries in the cache that stores directories for which we
 * defer sending the utimes update. The cache uses raw struct
 * btrfs_lru_cache_entry entries, so it uses at most 4096 bytes -
 * sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but the kmalloc-64 slab
 * is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64

struct send_ctx {
	struct file *send_filp;
	loff_t send_off;
	char *send_buf;
	u32 send_size;
	u32 send_max_size;
	/*
	 * Whether BTRFS_SEND_A_DATA attribute was already added to the current
	 * command (since protocol v2, data must be the last attribute).
	 */
	bool put_data;
	struct page **send_buf_pages;
	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
	/* Protocol version compatibility requested */
	u32 proto;

	struct btrfs_root *send_root;
	struct btrfs_root *parent_root;
	struct clone_root *clone_roots;
	int clone_roots_cnt;

	/* current state of the compare_tree call */
	struct btrfs_path *left_path;
	struct btrfs_path *right_path;
	struct btrfs_key *cmp_key;

	/*
	 * Keep track of the generation of the last transaction that was used
	 * for relocating a block group. This is periodically checked in order
	 * to detect if a relocation happened since the last check, so that we
	 * don't operate on stale extent buffers for nodes (level >= 1) or on
	 * stale disk_bytenr values of file extent items.
	 */
	u64 last_reloc_trans;

	/*
	 * Info about the currently processed inode. In case of deleted inodes,
	 * these are the values from the deleted inode.
	 */
	u64 cur_ino;
	u64 cur_inode_gen;
	u64 cur_inode_size;
	u64 cur_inode_mode;
	u64 cur_inode_rdev;
	u64 cur_inode_last_extent;
	u64 cur_inode_next_write_offset;
	struct fs_path cur_inode_path;
	bool cur_inode_new;
	bool cur_inode_new_gen;
	bool cur_inode_deleted;
	bool ignore_cur_inode;
	bool cur_inode_needs_verity;
	void *verity_descriptor;

	u64 send_progress;

	struct list_head new_refs;
	struct list_head deleted_refs;

	struct btrfs_lru_cache name_cache;

	/*
	 * The inode we are currently processing. It's not NULL only when we
	 * need to issue write commands for data extents from this inode.
198 */ 199 struct inode *cur_inode; 200 struct file_ra_state ra; 201 u64 page_cache_clear_start; 202 bool clean_page_cache; 203 204 /* 205 * We process inodes by their increasing order, so if before an 206 * incremental send we reverse the parent/child relationship of 207 * directories such that a directory with a lower inode number was 208 * the parent of a directory with a higher inode number, and the one 209 * becoming the new parent got renamed too, we can't rename/move the 210 * directory with lower inode number when we finish processing it - we 211 * must process the directory with higher inode number first, then 212 * rename/move it and then rename/move the directory with lower inode 213 * number. Example follows. 214 * 215 * Tree state when the first send was performed: 216 * 217 * . 218 * |-- a (ino 257) 219 * |-- b (ino 258) 220 * | 221 * | 222 * |-- c (ino 259) 223 * | |-- d (ino 260) 224 * | 225 * |-- c2 (ino 261) 226 * 227 * Tree state when the second (incremental) send is performed: 228 * 229 * . 230 * |-- a (ino 257) 231 * |-- b (ino 258) 232 * |-- c2 (ino 261) 233 * |-- d2 (ino 260) 234 * |-- cc (ino 259) 235 * 236 * The sequence of steps that lead to the second state was: 237 * 238 * mv /a/b/c/d /a/b/c2/d2 239 * mv /a/b/c /a/b/c2/d2/cc 240 * 241 * "c" has lower inode number, but we can't move it (2nd mv operation) 242 * before we move "d", which has higher inode number. 243 * 244 * So we just memorize which move/rename operations must be performed 245 * later when their respective parent is processed and moved/renamed. 246 */ 247 248 /* Indexed by parent directory inode number. */ 249 struct rb_root pending_dir_moves; 250 251 /* 252 * Reverse index, indexed by the inode number of a directory that 253 * is waiting for the move/rename of its immediate parent before its 254 * own move/rename can be performed. 255 */ 256 struct rb_root waiting_dir_moves; 257 258 /* 259 * A directory that is going to be rm'ed might have a child directory 260 * which is in the pending directory moves index above. In this case, 261 * the directory can only be removed after the move/rename of its child 262 * is performed. Example: 263 * 264 * Parent snapshot: 265 * 266 * . (ino 256) 267 * |-- a/ (ino 257) 268 * |-- b/ (ino 258) 269 * |-- c/ (ino 259) 270 * | |-- x/ (ino 260) 271 * | 272 * |-- y/ (ino 261) 273 * 274 * Send snapshot: 275 * 276 * . (ino 256) 277 * |-- a/ (ino 257) 278 * |-- b/ (ino 258) 279 * |-- YY/ (ino 261) 280 * |-- x/ (ino 260) 281 * 282 * Sequence of steps that lead to the send snapshot: 283 * rm -f /a/b/c/foo.txt 284 * mv /a/b/y /a/b/YY 285 * mv /a/b/c/x /a/b/YY 286 * rmdir /a/b/c 287 * 288 * When the child is processed, its move/rename is delayed until its 289 * parent is processed (as explained above), but all other operations 290 * like update utimes, chown, chgrp, etc, are performed and the paths 291 * that it uses for those operations must use the orphanized name of 292 * its parent (the directory we're going to rm later), so we need to 293 * memorize that name. 294 * 295 * Indexed by the inode number of the directory to be deleted. 
296 */ 297 struct rb_root orphan_dirs; 298 299 struct rb_root rbtree_new_refs; 300 struct rb_root rbtree_deleted_refs; 301 302 struct btrfs_lru_cache backref_cache; 303 u64 backref_cache_last_reloc_trans; 304 305 struct btrfs_lru_cache dir_created_cache; 306 struct btrfs_lru_cache dir_utimes_cache; 307 }; 308 309 struct pending_dir_move { 310 struct rb_node node; 311 struct list_head list; 312 u64 parent_ino; 313 u64 ino; 314 u64 gen; 315 struct list_head update_refs; 316 }; 317 318 struct waiting_dir_move { 319 struct rb_node node; 320 u64 ino; 321 /* 322 * There might be some directory that could not be removed because it 323 * was waiting for this directory inode to be moved first. Therefore 324 * after this directory is moved, we can try to rmdir the ino rmdir_ino. 325 */ 326 u64 rmdir_ino; 327 u64 rmdir_gen; 328 bool orphanized; 329 }; 330 331 struct orphan_dir_info { 332 struct rb_node node; 333 u64 ino; 334 u64 gen; 335 u64 last_dir_index_offset; 336 u64 dir_high_seq_ino; 337 }; 338 339 struct name_cache_entry { 340 /* 341 * The key in the entry is an inode number, and the generation matches 342 * the inode's generation. 343 */ 344 struct btrfs_lru_cache_entry entry; 345 u64 parent_ino; 346 u64 parent_gen; 347 int ret; 348 int need_later_update; 349 /* Name length without NUL terminator. */ 350 int name_len; 351 /* Not NUL terminated. */ 352 char name[] __counted_by(name_len) __nonstring; 353 }; 354 355 /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ 356 static_assert(offsetof(struct name_cache_entry, entry) == 0); 357 358 #define ADVANCE 1 359 #define ADVANCE_ONLY_NEXT -1 360 361 enum btrfs_compare_tree_result { 362 BTRFS_COMPARE_TREE_NEW, 363 BTRFS_COMPARE_TREE_DELETED, 364 BTRFS_COMPARE_TREE_CHANGED, 365 BTRFS_COMPARE_TREE_SAME, 366 }; 367 368 __cold 369 static void inconsistent_snapshot_error(struct send_ctx *sctx, 370 enum btrfs_compare_tree_result result, 371 const char *what) 372 { 373 const char *result_string; 374 375 switch (result) { 376 case BTRFS_COMPARE_TREE_NEW: 377 result_string = "new"; 378 break; 379 case BTRFS_COMPARE_TREE_DELETED: 380 result_string = "deleted"; 381 break; 382 case BTRFS_COMPARE_TREE_CHANGED: 383 result_string = "updated"; 384 break; 385 case BTRFS_COMPARE_TREE_SAME: 386 DEBUG_WARN("no change between trees"); 387 result_string = "unchanged"; 388 break; 389 default: 390 DEBUG_WARN("unexpected comparison result %d", result); 391 result_string = "unexpected"; 392 } 393 394 btrfs_err(sctx->send_root->fs_info, 395 "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", 396 result_string, what, sctx->cmp_key->objectid, 397 btrfs_root_id(sctx->send_root), 398 (sctx->parent_root ? 
btrfs_root_id(sctx->parent_root) : 0)); 399 } 400 401 __maybe_unused 402 static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) 403 { 404 switch (sctx->proto) { 405 case 1: return cmd <= BTRFS_SEND_C_MAX_V1; 406 case 2: return cmd <= BTRFS_SEND_C_MAX_V2; 407 case 3: return cmd <= BTRFS_SEND_C_MAX_V3; 408 default: return false; 409 } 410 } 411 412 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 413 414 static struct waiting_dir_move * 415 get_waiting_dir_move(struct send_ctx *sctx, u64 ino); 416 417 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen); 418 419 static int need_send_hole(struct send_ctx *sctx) 420 { 421 return (sctx->parent_root && !sctx->cur_inode_new && 422 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted && 423 S_ISREG(sctx->cur_inode_mode)); 424 } 425 426 static void fs_path_reset(struct fs_path *p) 427 { 428 if (p->reversed) 429 p->start = p->buf + p->buf_len - 1; 430 else 431 p->start = p->buf; 432 433 p->end = p->start; 434 *p->start = 0; 435 } 436 437 static void init_path(struct fs_path *p) 438 { 439 p->reversed = 0; 440 p->buf = p->inline_buf; 441 p->buf_len = FS_PATH_INLINE_SIZE; 442 fs_path_reset(p); 443 } 444 445 static struct fs_path *fs_path_alloc(void) 446 { 447 struct fs_path *p; 448 449 p = kmalloc(sizeof(*p), GFP_KERNEL); 450 if (!p) 451 return NULL; 452 init_path(p); 453 return p; 454 } 455 456 static struct fs_path *fs_path_alloc_reversed(void) 457 { 458 struct fs_path *p; 459 460 p = fs_path_alloc(); 461 if (!p) 462 return NULL; 463 p->reversed = 1; 464 fs_path_reset(p); 465 return p; 466 } 467 468 static void fs_path_free(struct fs_path *p) 469 { 470 if (!p) 471 return; 472 if (p->buf != p->inline_buf) 473 kfree(p->buf); 474 kfree(p); 475 } 476 477 static inline int fs_path_len(const struct fs_path *p) 478 { 479 return p->end - p->start; 480 } 481 482 static int fs_path_ensure_buf(struct fs_path *p, int len) 483 { 484 char *tmp_buf; 485 int path_len; 486 int old_buf_len; 487 488 len++; 489 490 if (p->buf_len >= len) 491 return 0; 492 493 if (WARN_ON(len > PATH_MAX)) 494 return -ENAMETOOLONG; 495 496 path_len = fs_path_len(p); 497 old_buf_len = p->buf_len; 498 499 /* 500 * Allocate to the next largest kmalloc bucket size, to let 501 * the fast path happen most of the time. 
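	 * For example (illustrative; exact bucket sizes depend on the kernel
	 * config): a request for 70 bytes is rounded up to the kmalloc-96
	 * bucket, so a later small growth of the path still fits the existing
	 * allocation and takes the fast path above.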
502 */ 503 len = kmalloc_size_roundup(len); 504 /* 505 * First time the inline_buf does not suffice 506 */ 507 if (p->buf == p->inline_buf) { 508 tmp_buf = kmalloc(len, GFP_KERNEL); 509 if (tmp_buf) 510 memcpy(tmp_buf, p->buf, old_buf_len); 511 } else { 512 tmp_buf = krealloc(p->buf, len, GFP_KERNEL); 513 } 514 if (!tmp_buf) 515 return -ENOMEM; 516 p->buf = tmp_buf; 517 p->buf_len = len; 518 519 if (p->reversed) { 520 tmp_buf = p->buf + old_buf_len - path_len - 1; 521 p->end = p->buf + p->buf_len - 1; 522 p->start = p->end - path_len; 523 memmove(p->start, tmp_buf, path_len + 1); 524 } else { 525 p->start = p->buf; 526 p->end = p->start + path_len; 527 } 528 return 0; 529 } 530 531 static int fs_path_prepare_for_add(struct fs_path *p, int name_len, 532 char **prepared) 533 { 534 int ret; 535 int new_len; 536 537 new_len = fs_path_len(p) + name_len; 538 if (p->start != p->end) 539 new_len++; 540 ret = fs_path_ensure_buf(p, new_len); 541 if (ret < 0) 542 return ret; 543 544 if (p->reversed) { 545 if (p->start != p->end) 546 *--p->start = '/'; 547 p->start -= name_len; 548 *prepared = p->start; 549 } else { 550 if (p->start != p->end) 551 *p->end++ = '/'; 552 *prepared = p->end; 553 p->end += name_len; 554 *p->end = 0; 555 } 556 557 return 0; 558 } 559 560 static int fs_path_add(struct fs_path *p, const char *name, int name_len) 561 { 562 int ret; 563 char *prepared; 564 565 ret = fs_path_prepare_for_add(p, name_len, &prepared); 566 if (ret < 0) 567 return ret; 568 memcpy(prepared, name, name_len); 569 570 return 0; 571 } 572 573 static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) 574 { 575 return fs_path_add(p, p2->start, fs_path_len(p2)); 576 } 577 578 static int fs_path_add_from_extent_buffer(struct fs_path *p, 579 struct extent_buffer *eb, 580 unsigned long off, int len) 581 { 582 int ret; 583 char *prepared; 584 585 ret = fs_path_prepare_for_add(p, len, &prepared); 586 if (ret < 0) 587 return ret; 588 589 read_extent_buffer(eb, prepared, off, len); 590 591 return 0; 592 } 593 594 static int fs_path_copy(struct fs_path *p, struct fs_path *from) 595 { 596 p->reversed = from->reversed; 597 fs_path_reset(p); 598 599 return fs_path_add_path(p, from); 600 } 601 602 static void fs_path_unreverse(struct fs_path *p) 603 { 604 char *tmp; 605 int len; 606 607 if (!p->reversed) 608 return; 609 610 tmp = p->start; 611 len = fs_path_len(p); 612 p->start = p->buf; 613 p->end = p->start + len; 614 memmove(p->start, tmp, len + 1); 615 p->reversed = 0; 616 } 617 618 static inline bool is_current_inode_path(const struct send_ctx *sctx, 619 const struct fs_path *path) 620 { 621 const struct fs_path *cur = &sctx->cur_inode_path; 622 623 return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); 624 } 625 626 static struct btrfs_path *alloc_path_for_send(void) 627 { 628 struct btrfs_path *path; 629 630 path = btrfs_alloc_path(); 631 if (!path) 632 return NULL; 633 path->search_commit_root = 1; 634 path->skip_locking = 1; 635 path->need_commit_sem = 1; 636 return path; 637 } 638 639 static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) 640 { 641 int ret; 642 u32 pos = 0; 643 644 while (pos < len) { 645 ret = kernel_write(filp, buf + pos, len - pos, off); 646 if (ret < 0) 647 return ret; 648 if (ret == 0) 649 return -EIO; 650 pos += ret; 651 } 652 653 return 0; 654 } 655 656 static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) 657 { 658 struct btrfs_tlv_header *hdr; 659 int total_len = sizeof(*hdr) + len; 660 int left = 
sctx->send_max_size - sctx->send_size; 661 662 if (WARN_ON_ONCE(sctx->put_data)) 663 return -EINVAL; 664 665 if (unlikely(left < total_len)) 666 return -EOVERFLOW; 667 668 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); 669 put_unaligned_le16(attr, &hdr->tlv_type); 670 put_unaligned_le16(len, &hdr->tlv_len); 671 memcpy(hdr + 1, data, len); 672 sctx->send_size += total_len; 673 674 return 0; 675 } 676 677 #define TLV_PUT_DEFINE_INT(bits) \ 678 static int tlv_put_u##bits(struct send_ctx *sctx, \ 679 u##bits attr, u##bits value) \ 680 { \ 681 __le##bits __tmp = cpu_to_le##bits(value); \ 682 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ 683 } 684 685 TLV_PUT_DEFINE_INT(8) 686 TLV_PUT_DEFINE_INT(32) 687 TLV_PUT_DEFINE_INT(64) 688 689 static int tlv_put_string(struct send_ctx *sctx, u16 attr, 690 const char *str, int len) 691 { 692 if (len == -1) 693 len = strlen(str); 694 return tlv_put(sctx, attr, str, len); 695 } 696 697 static int tlv_put_uuid(struct send_ctx *sctx, u16 attr, 698 const u8 *uuid) 699 { 700 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); 701 } 702 703 static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, 704 struct extent_buffer *eb, 705 struct btrfs_timespec *ts) 706 { 707 struct btrfs_timespec bts; 708 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts)); 709 return tlv_put(sctx, attr, &bts, sizeof(bts)); 710 } 711 712 713 #define TLV_PUT(sctx, attrtype, data, attrlen) \ 714 do { \ 715 ret = tlv_put(sctx, attrtype, data, attrlen); \ 716 if (ret < 0) \ 717 goto tlv_put_failure; \ 718 } while (0) 719 720 #define TLV_PUT_INT(sctx, attrtype, bits, value) \ 721 do { \ 722 ret = tlv_put_u##bits(sctx, attrtype, value); \ 723 if (ret < 0) \ 724 goto tlv_put_failure; \ 725 } while (0) 726 727 #define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data) 728 #define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data) 729 #define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data) 730 #define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data) 731 #define TLV_PUT_STRING(sctx, attrtype, str, len) \ 732 do { \ 733 ret = tlv_put_string(sctx, attrtype, str, len); \ 734 if (ret < 0) \ 735 goto tlv_put_failure; \ 736 } while (0) 737 #define TLV_PUT_PATH(sctx, attrtype, p) \ 738 do { \ 739 ret = tlv_put_string(sctx, attrtype, p->start, \ 740 fs_path_len((p))); \ 741 if (ret < 0) \ 742 goto tlv_put_failure; \ 743 } while(0) 744 #define TLV_PUT_UUID(sctx, attrtype, uuid) \ 745 do { \ 746 ret = tlv_put_uuid(sctx, attrtype, uuid); \ 747 if (ret < 0) \ 748 goto tlv_put_failure; \ 749 } while (0) 750 #define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ 751 do { \ 752 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ 753 if (ret < 0) \ 754 goto tlv_put_failure; \ 755 } while (0) 756 757 static int send_header(struct send_ctx *sctx) 758 { 759 struct btrfs_stream_header hdr; 760 761 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); 762 hdr.version = cpu_to_le32(sctx->proto); 763 return write_buf(sctx->send_filp, &hdr, sizeof(hdr), 764 &sctx->send_off); 765 } 766 767 /* 768 * For each command/item we want to send to userspace, we call this function. 
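 * The usual sequence, as used by the send_* helpers below (e.g. send_rename()),
 * is:
 *
 *   begin_cmd(sctx, BTRFS_SEND_C_RENAME);
 *   TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
 *   TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
 *   send_cmd(sctx);
 *
 * where begin_cmd() reserves space for the command header, the TLV_PUT_*()
 * macros append attributes, and send_cmd() fills in the length and crc32c
 * before writing the buffer out (error handling omitted for brevity).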
769 */ 770 static int begin_cmd(struct send_ctx *sctx, int cmd) 771 { 772 struct btrfs_cmd_header *hdr; 773 774 if (WARN_ON(!sctx->send_buf)) 775 return -EINVAL; 776 777 if (unlikely(sctx->send_size != 0)) { 778 btrfs_err(sctx->send_root->fs_info, 779 "send: command header buffer not empty cmd %d offset %llu", 780 cmd, sctx->send_off); 781 return -EINVAL; 782 } 783 784 sctx->send_size += sizeof(*hdr); 785 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 786 put_unaligned_le16(cmd, &hdr->cmd); 787 788 return 0; 789 } 790 791 static int send_cmd(struct send_ctx *sctx) 792 { 793 int ret; 794 struct btrfs_cmd_header *hdr; 795 u32 crc; 796 797 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 798 put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); 799 put_unaligned_le32(0, &hdr->crc); 800 801 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); 802 put_unaligned_le32(crc, &hdr->crc); 803 804 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 805 &sctx->send_off); 806 807 sctx->send_size = 0; 808 sctx->put_data = false; 809 810 return ret; 811 } 812 813 /* 814 * Sends a move instruction to user space 815 */ 816 static int send_rename(struct send_ctx *sctx, 817 struct fs_path *from, struct fs_path *to) 818 { 819 int ret; 820 821 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); 822 if (ret < 0) 823 return ret; 824 825 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); 826 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); 827 828 ret = send_cmd(sctx); 829 830 tlv_put_failure: 831 return ret; 832 } 833 834 /* 835 * Sends a link instruction to user space 836 */ 837 static int send_link(struct send_ctx *sctx, 838 struct fs_path *path, struct fs_path *lnk) 839 { 840 int ret; 841 842 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); 843 if (ret < 0) 844 return ret; 845 846 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 847 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); 848 849 ret = send_cmd(sctx); 850 851 tlv_put_failure: 852 return ret; 853 } 854 855 /* 856 * Sends an unlink instruction to user space 857 */ 858 static int send_unlink(struct send_ctx *sctx, struct fs_path *path) 859 { 860 int ret; 861 862 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); 863 if (ret < 0) 864 return ret; 865 866 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 867 868 ret = send_cmd(sctx); 869 870 tlv_put_failure: 871 return ret; 872 } 873 874 /* 875 * Sends a rmdir instruction to user space 876 */ 877 static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) 878 { 879 int ret; 880 881 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); 882 if (ret < 0) 883 return ret; 884 885 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 886 887 ret = send_cmd(sctx); 888 889 tlv_put_failure: 890 return ret; 891 } 892 893 struct btrfs_inode_info { 894 u64 size; 895 u64 gen; 896 u64 mode; 897 u64 uid; 898 u64 gid; 899 u64 rdev; 900 u64 fileattr; 901 u64 nlink; 902 }; 903 904 /* 905 * Helper function to retrieve some fields from an inode item. 
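 * Returns 0 on success, -ENOENT if the inode item does not exist in @root,
 * or another negative errno on search failure. When @info is NULL only the
 * existence of the inode item is checked.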
906 */ 907 static int get_inode_info(struct btrfs_root *root, u64 ino, 908 struct btrfs_inode_info *info) 909 { 910 int ret; 911 struct btrfs_path *path; 912 struct btrfs_inode_item *ii; 913 struct btrfs_key key; 914 915 path = alloc_path_for_send(); 916 if (!path) 917 return -ENOMEM; 918 919 key.objectid = ino; 920 key.type = BTRFS_INODE_ITEM_KEY; 921 key.offset = 0; 922 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 923 if (ret) { 924 if (ret > 0) 925 ret = -ENOENT; 926 goto out; 927 } 928 929 if (!info) 930 goto out; 931 932 ii = btrfs_item_ptr(path->nodes[0], path->slots[0], 933 struct btrfs_inode_item); 934 info->size = btrfs_inode_size(path->nodes[0], ii); 935 info->gen = btrfs_inode_generation(path->nodes[0], ii); 936 info->mode = btrfs_inode_mode(path->nodes[0], ii); 937 info->uid = btrfs_inode_uid(path->nodes[0], ii); 938 info->gid = btrfs_inode_gid(path->nodes[0], ii); 939 info->rdev = btrfs_inode_rdev(path->nodes[0], ii); 940 info->nlink = btrfs_inode_nlink(path->nodes[0], ii); 941 /* 942 * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's 943 * otherwise logically split to 32/32 parts. 944 */ 945 info->fileattr = btrfs_inode_flags(path->nodes[0], ii); 946 947 out: 948 btrfs_free_path(path); 949 return ret; 950 } 951 952 static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen) 953 { 954 int ret; 955 struct btrfs_inode_info info = { 0 }; 956 957 ASSERT(gen); 958 959 ret = get_inode_info(root, ino, &info); 960 *gen = info.gen; 961 return ret; 962 } 963 964 typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx); 965 966 /* 967 * Helper function to iterate the entries in ONE btrfs_inode_ref or 968 * btrfs_inode_extref. 969 * The iterate callback may return a non zero value to stop iteration. This can 970 * be a negative value for error codes or 1 to simply stop it. 971 * 972 * path must point to the INODE_REF or INODE_EXTREF when called. 
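 * For BTRFS_INODE_REF_KEY items the parent directory is the key's offset,
 * while for BTRFS_INODE_EXTREF_KEY items it is read from the extref itself
 * via btrfs_inode_extref_parent(). With 'resolve' set, each name is expanded
 * into a full path with btrfs_ref_to_path() instead of just the ref's name.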
973 */ 974 static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, 975 struct btrfs_key *found_key, int resolve, 976 iterate_inode_ref_t iterate, void *ctx) 977 { 978 struct extent_buffer *eb = path->nodes[0]; 979 struct btrfs_inode_ref *iref; 980 struct btrfs_inode_extref *extref; 981 struct btrfs_path *tmp_path; 982 struct fs_path *p; 983 u32 cur = 0; 984 u32 total; 985 int slot = path->slots[0]; 986 u32 name_len; 987 char *start; 988 int ret = 0; 989 u64 dir; 990 unsigned long name_off; 991 unsigned long elem_size; 992 unsigned long ptr; 993 994 p = fs_path_alloc_reversed(); 995 if (!p) 996 return -ENOMEM; 997 998 tmp_path = alloc_path_for_send(); 999 if (!tmp_path) { 1000 fs_path_free(p); 1001 return -ENOMEM; 1002 } 1003 1004 1005 if (found_key->type == BTRFS_INODE_REF_KEY) { 1006 ptr = (unsigned long)btrfs_item_ptr(eb, slot, 1007 struct btrfs_inode_ref); 1008 total = btrfs_item_size(eb, slot); 1009 elem_size = sizeof(*iref); 1010 } else { 1011 ptr = btrfs_item_ptr_offset(eb, slot); 1012 total = btrfs_item_size(eb, slot); 1013 elem_size = sizeof(*extref); 1014 } 1015 1016 while (cur < total) { 1017 fs_path_reset(p); 1018 1019 if (found_key->type == BTRFS_INODE_REF_KEY) { 1020 iref = (struct btrfs_inode_ref *)(ptr + cur); 1021 name_len = btrfs_inode_ref_name_len(eb, iref); 1022 name_off = (unsigned long)(iref + 1); 1023 dir = found_key->offset; 1024 } else { 1025 extref = (struct btrfs_inode_extref *)(ptr + cur); 1026 name_len = btrfs_inode_extref_name_len(eb, extref); 1027 name_off = (unsigned long)&extref->name; 1028 dir = btrfs_inode_extref_parent(eb, extref); 1029 } 1030 1031 if (resolve) { 1032 start = btrfs_ref_to_path(root, tmp_path, name_len, 1033 name_off, eb, dir, 1034 p->buf, p->buf_len); 1035 if (IS_ERR(start)) { 1036 ret = PTR_ERR(start); 1037 goto out; 1038 } 1039 if (start < p->buf) { 1040 /* overflow , try again with larger buffer */ 1041 ret = fs_path_ensure_buf(p, 1042 p->buf_len + p->buf - start); 1043 if (ret < 0) 1044 goto out; 1045 start = btrfs_ref_to_path(root, tmp_path, 1046 name_len, name_off, 1047 eb, dir, 1048 p->buf, p->buf_len); 1049 if (IS_ERR(start)) { 1050 ret = PTR_ERR(start); 1051 goto out; 1052 } 1053 if (unlikely(start < p->buf)) { 1054 btrfs_err(root->fs_info, 1055 "send: path ref buffer underflow for key (%llu %u %llu)", 1056 found_key->objectid, 1057 found_key->type, 1058 found_key->offset); 1059 ret = -EINVAL; 1060 goto out; 1061 } 1062 } 1063 p->start = start; 1064 } else { 1065 ret = fs_path_add_from_extent_buffer(p, eb, name_off, 1066 name_len); 1067 if (ret < 0) 1068 goto out; 1069 } 1070 1071 cur += elem_size + name_len; 1072 ret = iterate(dir, p, ctx); 1073 if (ret) 1074 goto out; 1075 } 1076 1077 out: 1078 btrfs_free_path(tmp_path); 1079 fs_path_free(p); 1080 return ret; 1081 } 1082 1083 typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, 1084 const char *name, int name_len, 1085 const char *data, int data_len, 1086 void *ctx); 1087 1088 /* 1089 * Helper function to iterate the entries in ONE btrfs_dir_item. 1090 * The iterate callback may return a non zero value to stop iteration. This can 1091 * be a negative value for error codes or 1 to simply stop it. 1092 * 1093 * path must point to the dir item when called. 
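 * This is used both for regular directory entries and for xattr items
 * (BTRFS_FT_XATTR), which is why the callback receives a data/data_len pair
 * in addition to the entry name.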
1094 */ 1095 static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, 1096 iterate_dir_item_t iterate, void *ctx) 1097 { 1098 int ret = 0; 1099 struct extent_buffer *eb; 1100 struct btrfs_dir_item *di; 1101 struct btrfs_key di_key; 1102 char *buf = NULL; 1103 int buf_len; 1104 u32 name_len; 1105 u32 data_len; 1106 u32 cur; 1107 u32 len; 1108 u32 total; 1109 int slot; 1110 int num; 1111 1112 /* 1113 * Start with a small buffer (1 page). If later we end up needing more 1114 * space, which can happen for xattrs on a fs with a leaf size greater 1115 * than the page size, attempt to increase the buffer. Typically xattr 1116 * values are small. 1117 */ 1118 buf_len = PATH_MAX; 1119 buf = kmalloc(buf_len, GFP_KERNEL); 1120 if (!buf) { 1121 ret = -ENOMEM; 1122 goto out; 1123 } 1124 1125 eb = path->nodes[0]; 1126 slot = path->slots[0]; 1127 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 1128 cur = 0; 1129 len = 0; 1130 total = btrfs_item_size(eb, slot); 1131 1132 num = 0; 1133 while (cur < total) { 1134 name_len = btrfs_dir_name_len(eb, di); 1135 data_len = btrfs_dir_data_len(eb, di); 1136 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1137 1138 if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { 1139 if (name_len > XATTR_NAME_MAX) { 1140 ret = -ENAMETOOLONG; 1141 goto out; 1142 } 1143 if (name_len + data_len > 1144 BTRFS_MAX_XATTR_SIZE(root->fs_info)) { 1145 ret = -E2BIG; 1146 goto out; 1147 } 1148 } else { 1149 /* 1150 * Path too long 1151 */ 1152 if (name_len + data_len > PATH_MAX) { 1153 ret = -ENAMETOOLONG; 1154 goto out; 1155 } 1156 } 1157 1158 if (name_len + data_len > buf_len) { 1159 buf_len = name_len + data_len; 1160 if (is_vmalloc_addr(buf)) { 1161 vfree(buf); 1162 buf = NULL; 1163 } else { 1164 char *tmp = krealloc(buf, buf_len, 1165 GFP_KERNEL | __GFP_NOWARN); 1166 1167 if (!tmp) 1168 kfree(buf); 1169 buf = tmp; 1170 } 1171 if (!buf) { 1172 buf = kvmalloc(buf_len, GFP_KERNEL); 1173 if (!buf) { 1174 ret = -ENOMEM; 1175 goto out; 1176 } 1177 } 1178 } 1179 1180 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1181 name_len + data_len); 1182 1183 len = sizeof(*di) + name_len + data_len; 1184 di = (struct btrfs_dir_item *)((char *)di + len); 1185 cur += len; 1186 1187 ret = iterate(num, &di_key, buf, name_len, buf + name_len, 1188 data_len, ctx); 1189 if (ret < 0) 1190 goto out; 1191 if (ret) { 1192 ret = 0; 1193 goto out; 1194 } 1195 1196 num++; 1197 } 1198 1199 out: 1200 kvfree(buf); 1201 return ret; 1202 } 1203 1204 static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx) 1205 { 1206 int ret; 1207 struct fs_path *pt = ctx; 1208 1209 ret = fs_path_copy(pt, p); 1210 if (ret < 0) 1211 return ret; 1212 1213 /* we want the first only */ 1214 return 1; 1215 } 1216 1217 /* 1218 * Retrieve the first path of an inode. If an inode has more then one 1219 * ref/hardlink, this is ignored. 
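 * Returns 0 on success, 1 or -ENOENT if no INODE_REF/INODE_EXTREF item exists
 * for the inode, and another negative errno on search errors.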
1220 */ 1221 static int get_inode_path(struct btrfs_root *root, 1222 u64 ino, struct fs_path *path) 1223 { 1224 int ret; 1225 struct btrfs_key key, found_key; 1226 struct btrfs_path *p; 1227 1228 p = alloc_path_for_send(); 1229 if (!p) 1230 return -ENOMEM; 1231 1232 fs_path_reset(path); 1233 1234 key.objectid = ino; 1235 key.type = BTRFS_INODE_REF_KEY; 1236 key.offset = 0; 1237 1238 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0); 1239 if (ret < 0) 1240 goto out; 1241 if (ret) { 1242 ret = 1; 1243 goto out; 1244 } 1245 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]); 1246 if (found_key.objectid != ino || 1247 (found_key.type != BTRFS_INODE_REF_KEY && 1248 found_key.type != BTRFS_INODE_EXTREF_KEY)) { 1249 ret = -ENOENT; 1250 goto out; 1251 } 1252 1253 ret = iterate_inode_ref(root, p, &found_key, 1, 1254 __copy_first_ref, path); 1255 if (ret < 0) 1256 goto out; 1257 ret = 0; 1258 1259 out: 1260 btrfs_free_path(p); 1261 return ret; 1262 } 1263 1264 struct backref_ctx { 1265 struct send_ctx *sctx; 1266 1267 /* number of total found references */ 1268 u64 found; 1269 1270 /* 1271 * used for clones found in send_root. clones found behind cur_objectid 1272 * and cur_offset are not considered as allowed clones. 1273 */ 1274 u64 cur_objectid; 1275 u64 cur_offset; 1276 1277 /* may be truncated in case it's the last extent in a file */ 1278 u64 extent_len; 1279 1280 /* The bytenr the file extent item we are processing refers to. */ 1281 u64 bytenr; 1282 /* The owner (root id) of the data backref for the current extent. */ 1283 u64 backref_owner; 1284 /* The offset of the data backref for the current extent. */ 1285 u64 backref_offset; 1286 }; 1287 1288 static int __clone_root_cmp_bsearch(const void *key, const void *elt) 1289 { 1290 u64 root = (u64)(uintptr_t)key; 1291 const struct clone_root *cr = elt; 1292 1293 if (root < btrfs_root_id(cr->root)) 1294 return -1; 1295 if (root > btrfs_root_id(cr->root)) 1296 return 1; 1297 return 0; 1298 } 1299 1300 static int __clone_root_cmp_sort(const void *e1, const void *e2) 1301 { 1302 const struct clone_root *cr1 = e1; 1303 const struct clone_root *cr2 = e2; 1304 1305 if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root)) 1306 return -1; 1307 if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root)) 1308 return 1; 1309 return 0; 1310 } 1311 1312 /* 1313 * Called for every backref that is found for the current extent. 1314 * Results are collected in sctx->clone_roots->ino/offset. 1315 */ 1316 static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, 1317 void *ctx_) 1318 { 1319 struct backref_ctx *bctx = ctx_; 1320 struct clone_root *clone_root; 1321 1322 /* First check if the root is in the list of accepted clone sources */ 1323 clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, 1324 bctx->sctx->clone_roots_cnt, 1325 sizeof(struct clone_root), 1326 __clone_root_cmp_bsearch); 1327 if (!clone_root) 1328 return 0; 1329 1330 /* This is our own reference, bail out as we can't clone from it. */ 1331 if (clone_root->root == bctx->sctx->send_root && 1332 ino == bctx->cur_objectid && 1333 offset == bctx->cur_offset) 1334 return 0; 1335 1336 /* 1337 * Make sure we don't consider clones from send_root that are 1338 * behind the current inode/offset. 1339 */ 1340 if (clone_root->root == bctx->sctx->send_root) { 1341 /* 1342 * If the source inode was not yet processed we can't issue a 1343 * clone operation, as the source extent does not exist yet at 1344 * the destination of the stream. 
1345 */ 1346 if (ino > bctx->cur_objectid) 1347 return 0; 1348 /* 1349 * We clone from the inode currently being sent as long as the 1350 * source extent is already processed, otherwise we could try 1351 * to clone from an extent that does not exist yet at the 1352 * destination of the stream. 1353 */ 1354 if (ino == bctx->cur_objectid && 1355 offset + bctx->extent_len > 1356 bctx->sctx->cur_inode_next_write_offset) 1357 return 0; 1358 } 1359 1360 bctx->found++; 1361 clone_root->found_ref = true; 1362 1363 /* 1364 * If the given backref refers to a file extent item with a larger 1365 * number of bytes than what we found before, use the new one so that 1366 * we clone more optimally and end up doing less writes and getting 1367 * less exclusive, non-shared extents at the destination. 1368 */ 1369 if (num_bytes > clone_root->num_bytes) { 1370 clone_root->ino = ino; 1371 clone_root->offset = offset; 1372 clone_root->num_bytes = num_bytes; 1373 1374 /* 1375 * Found a perfect candidate, so there's no need to continue 1376 * backref walking. 1377 */ 1378 if (num_bytes >= bctx->extent_len) 1379 return BTRFS_ITERATE_EXTENT_INODES_STOP; 1380 } 1381 1382 return 0; 1383 } 1384 1385 static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, 1386 const u64 **root_ids_ret, int *root_count_ret) 1387 { 1388 struct backref_ctx *bctx = ctx; 1389 struct send_ctx *sctx = bctx->sctx; 1390 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1391 const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; 1392 struct btrfs_lru_cache_entry *raw_entry; 1393 struct backref_cache_entry *entry; 1394 1395 if (sctx->backref_cache.size == 0) 1396 return false; 1397 1398 /* 1399 * If relocation happened since we first filled the cache, then we must 1400 * empty the cache and can not use it, because even though we operate on 1401 * read-only roots, their leaves and nodes may have been reallocated and 1402 * now be used for different nodes/leaves of the same tree or some other 1403 * tree. 1404 * 1405 * We are called from iterate_extent_inodes() while either holding a 1406 * transaction handle or holding fs_info->commit_root_sem, so no need 1407 * to take any lock here. 1408 */ 1409 if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) { 1410 btrfs_lru_cache_clear(&sctx->backref_cache); 1411 return false; 1412 } 1413 1414 raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0); 1415 if (!raw_entry) 1416 return false; 1417 1418 entry = container_of(raw_entry, struct backref_cache_entry, entry); 1419 *root_ids_ret = entry->root_ids; 1420 *root_count_ret = entry->num_roots; 1421 1422 return true; 1423 } 1424 1425 static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, 1426 void *ctx) 1427 { 1428 struct backref_ctx *bctx = ctx; 1429 struct send_ctx *sctx = bctx->sctx; 1430 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1431 struct backref_cache_entry *new_entry; 1432 struct ulist_iterator uiter; 1433 struct ulist_node *node; 1434 int ret; 1435 1436 /* 1437 * We're called while holding a transaction handle or while holding 1438 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a 1439 * NOFS allocation. 1440 */ 1441 new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); 1442 /* No worries, cache is optional. 
*/ 1443 if (!new_entry) 1444 return; 1445 1446 new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits; 1447 new_entry->entry.gen = 0; 1448 new_entry->num_roots = 0; 1449 ULIST_ITER_INIT(&uiter); 1450 while ((node = ulist_next(root_ids, &uiter)) != NULL) { 1451 const u64 root_id = node->val; 1452 struct clone_root *root; 1453 1454 root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, 1455 sctx->clone_roots_cnt, sizeof(struct clone_root), 1456 __clone_root_cmp_bsearch); 1457 if (!root) 1458 continue; 1459 1460 /* Too many roots, just exit, no worries as caching is optional. */ 1461 if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { 1462 kfree(new_entry); 1463 return; 1464 } 1465 1466 new_entry->root_ids[new_entry->num_roots] = root_id; 1467 new_entry->num_roots++; 1468 } 1469 1470 /* 1471 * We may have not added any roots to the new cache entry, which means 1472 * none of the roots is part of the list of roots from which we are 1473 * allowed to clone. Cache the new entry as it's still useful to avoid 1474 * backref walking to determine which roots have a path to the leaf. 1475 * 1476 * Also use GFP_NOFS because we're called while holding a transaction 1477 * handle or while holding fs_info->commit_root_sem. 1478 */ 1479 ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry, 1480 GFP_NOFS); 1481 ASSERT(ret == 0 || ret == -ENOMEM); 1482 if (ret) { 1483 /* Caching is optional, no worries. */ 1484 kfree(new_entry); 1485 return; 1486 } 1487 1488 /* 1489 * We are called from iterate_extent_inodes() while either holding a 1490 * transaction handle or holding fs_info->commit_root_sem, so no need 1491 * to take any lock here. 1492 */ 1493 if (sctx->backref_cache.size == 1) 1494 sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans; 1495 } 1496 1497 static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, 1498 const struct extent_buffer *leaf, void *ctx) 1499 { 1500 const u64 refs = btrfs_extent_refs(leaf, ei); 1501 const struct backref_ctx *bctx = ctx; 1502 const struct send_ctx *sctx = bctx->sctx; 1503 1504 if (bytenr == bctx->bytenr) { 1505 const u64 flags = btrfs_extent_flags(leaf, ei); 1506 1507 if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 1508 return -EUCLEAN; 1509 1510 /* 1511 * If we have only one reference and only the send root as a 1512 * clone source - meaning no clone roots were given in the 1513 * struct btrfs_ioctl_send_args passed to the send ioctl - then 1514 * it's our reference and there's no point in doing backref 1515 * walking which is expensive, so exit early. 1516 */ 1517 if (refs == 1 && sctx->clone_roots_cnt == 1) 1518 return -ENOENT; 1519 } 1520 1521 /* 1522 * Backreference walking (iterate_extent_inodes() below) is currently 1523 * too expensive when an extent has a large number of references, both 1524 * in time spent and used memory. So for now just fallback to write 1525 * operations instead of clone operations when an extent has more than 1526 * a certain amount of references. 1527 */ 1528 if (refs > SEND_MAX_EXTENT_REFS) 1529 return -ENOENT; 1530 1531 return 0; 1532 } 1533 1534 static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) 1535 { 1536 const struct backref_ctx *bctx = ctx; 1537 1538 if (ino == bctx->cur_objectid && 1539 root == bctx->backref_owner && 1540 offset == bctx->backref_offset) 1541 return true; 1542 1543 return false; 1544 } 1545 1546 /* 1547 * Given an inode, offset and extent item, it finds a good clone for a clone 1548 * instruction. 
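 * The best candidate is the accepted clone root from which the largest
 * number of bytes can be cloned (see iterate_backrefs()).
 *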
Returns -ENOENT when none could be found. The function makes 1549 * sure that the returned clone is usable at the point where sending is at the 1550 * moment. This means, that no clones are accepted which lie behind the current 1551 * inode+offset. 1552 * 1553 * path must point to the extent item when called. 1554 */ 1555 static int find_extent_clone(struct send_ctx *sctx, 1556 struct btrfs_path *path, 1557 u64 ino, u64 data_offset, 1558 u64 ino_size, 1559 struct clone_root **found) 1560 { 1561 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 1562 int ret; 1563 int extent_type; 1564 u64 disk_byte; 1565 u64 num_bytes; 1566 struct btrfs_file_extent_item *fi; 1567 struct extent_buffer *eb = path->nodes[0]; 1568 struct backref_ctx backref_ctx = { 0 }; 1569 struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; 1570 struct clone_root *cur_clone_root; 1571 int compressed; 1572 u32 i; 1573 1574 /* 1575 * With fallocate we can get prealloc extents beyond the inode's i_size, 1576 * so we don't do anything here because clone operations can not clone 1577 * to a range beyond i_size without increasing the i_size of the 1578 * destination inode. 1579 */ 1580 if (data_offset >= ino_size) 1581 return 0; 1582 1583 fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); 1584 extent_type = btrfs_file_extent_type(eb, fi); 1585 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1586 return -ENOENT; 1587 1588 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1589 if (disk_byte == 0) 1590 return -ENOENT; 1591 1592 compressed = btrfs_file_extent_compression(eb, fi); 1593 num_bytes = btrfs_file_extent_num_bytes(eb, fi); 1594 1595 /* 1596 * Setup the clone roots. 1597 */ 1598 for (i = 0; i < sctx->clone_roots_cnt; i++) { 1599 cur_clone_root = sctx->clone_roots + i; 1600 cur_clone_root->ino = (u64)-1; 1601 cur_clone_root->offset = 0; 1602 cur_clone_root->num_bytes = 0; 1603 cur_clone_root->found_ref = false; 1604 } 1605 1606 backref_ctx.sctx = sctx; 1607 backref_ctx.cur_objectid = ino; 1608 backref_ctx.cur_offset = data_offset; 1609 backref_ctx.bytenr = disk_byte; 1610 /* 1611 * Use the header owner and not the send root's id, because in case of a 1612 * snapshot we can have shared subtrees. 1613 */ 1614 backref_ctx.backref_owner = btrfs_header_owner(eb); 1615 backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); 1616 1617 /* 1618 * The last extent of a file may be too large due to page alignment. 1619 * We need to adjust extent_len in this case so that the checks in 1620 * iterate_backrefs() work. 1621 */ 1622 if (data_offset + num_bytes >= ino_size) 1623 backref_ctx.extent_len = ino_size - data_offset; 1624 else 1625 backref_ctx.extent_len = num_bytes; 1626 1627 /* 1628 * Now collect all backrefs. 
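	 * The walk is driven by iterate_extent_inodes(): for each backref of
	 * this extent it calls iterate_backrefs(), which records, per accepted
	 * clone root, the best (largest) matching file extent seen so far. The
	 * backref cache (lookup_backref_cache()/store_backref_cache()) avoids
	 * repeating the expensive root resolution for leaves already seen.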
1629 */ 1630 backref_walk_ctx.bytenr = disk_byte; 1631 if (compressed == BTRFS_COMPRESS_NONE) 1632 backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); 1633 backref_walk_ctx.fs_info = fs_info; 1634 backref_walk_ctx.cache_lookup = lookup_backref_cache; 1635 backref_walk_ctx.cache_store = store_backref_cache; 1636 backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; 1637 backref_walk_ctx.check_extent_item = check_extent_item; 1638 backref_walk_ctx.user_ctx = &backref_ctx; 1639 1640 /* 1641 * If have a single clone root, then it's the send root and we can tell 1642 * the backref walking code to skip our own backref and not resolve it, 1643 * since we can not use it for cloning - the source and destination 1644 * ranges can't overlap and in case the leaf is shared through a subtree 1645 * due to snapshots, we can't use those other roots since they are not 1646 * in the list of clone roots. 1647 */ 1648 if (sctx->clone_roots_cnt == 1) 1649 backref_walk_ctx.skip_data_ref = skip_self_data_ref; 1650 1651 ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, 1652 &backref_ctx); 1653 if (ret < 0) 1654 return ret; 1655 1656 down_read(&fs_info->commit_root_sem); 1657 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { 1658 /* 1659 * A transaction commit for a transaction in which block group 1660 * relocation was done just happened. 1661 * The disk_bytenr of the file extent item we processed is 1662 * possibly stale, referring to the extent's location before 1663 * relocation. So act as if we haven't found any clone sources 1664 * and fallback to write commands, which will read the correct 1665 * data from the new extent location. Otherwise we will fail 1666 * below because we haven't found our own back reference or we 1667 * could be getting incorrect sources in case the old extent 1668 * was already reallocated after the relocation. 1669 */ 1670 up_read(&fs_info->commit_root_sem); 1671 return -ENOENT; 1672 } 1673 up_read(&fs_info->commit_root_sem); 1674 1675 if (!backref_ctx.found) 1676 return -ENOENT; 1677 1678 cur_clone_root = NULL; 1679 for (i = 0; i < sctx->clone_roots_cnt; i++) { 1680 struct clone_root *clone_root = &sctx->clone_roots[i]; 1681 1682 if (!clone_root->found_ref) 1683 continue; 1684 1685 /* 1686 * Choose the root from which we can clone more bytes, to 1687 * minimize write operations and therefore have more extent 1688 * sharing at the destination (the same as in the source). 1689 */ 1690 if (!cur_clone_root || 1691 clone_root->num_bytes > cur_clone_root->num_bytes) { 1692 cur_clone_root = clone_root; 1693 1694 /* 1695 * We found an optimal clone candidate (any inode from 1696 * any root is fine), so we're done. 
1697 */ 1698 if (clone_root->num_bytes >= backref_ctx.extent_len) 1699 break; 1700 } 1701 } 1702 1703 if (cur_clone_root) { 1704 *found = cur_clone_root; 1705 ret = 0; 1706 } else { 1707 ret = -ENOENT; 1708 } 1709 1710 return ret; 1711 } 1712 1713 static int read_symlink(struct btrfs_root *root, 1714 u64 ino, 1715 struct fs_path *dest) 1716 { 1717 int ret; 1718 struct btrfs_path *path; 1719 struct btrfs_key key; 1720 struct btrfs_file_extent_item *ei; 1721 u8 type; 1722 u8 compression; 1723 unsigned long off; 1724 int len; 1725 1726 path = alloc_path_for_send(); 1727 if (!path) 1728 return -ENOMEM; 1729 1730 key.objectid = ino; 1731 key.type = BTRFS_EXTENT_DATA_KEY; 1732 key.offset = 0; 1733 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1734 if (ret < 0) 1735 goto out; 1736 if (ret) { 1737 /* 1738 * An empty symlink inode. Can happen in rare error paths when 1739 * creating a symlink (transaction committed before the inode 1740 * eviction handler removed the symlink inode items and a crash 1741 * happened in between or the subvol was snapshoted in between). 1742 * Print an informative message to dmesg/syslog so that the user 1743 * can delete the symlink. 1744 */ 1745 btrfs_err(root->fs_info, 1746 "Found empty symlink inode %llu at root %llu", 1747 ino, btrfs_root_id(root)); 1748 ret = -EIO; 1749 goto out; 1750 } 1751 1752 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 1753 struct btrfs_file_extent_item); 1754 type = btrfs_file_extent_type(path->nodes[0], ei); 1755 if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) { 1756 ret = -EUCLEAN; 1757 btrfs_crit(root->fs_info, 1758 "send: found symlink extent that is not inline, ino %llu root %llu extent type %d", 1759 ino, btrfs_root_id(root), type); 1760 goto out; 1761 } 1762 compression = btrfs_file_extent_compression(path->nodes[0], ei); 1763 if (unlikely(compression != BTRFS_COMPRESS_NONE)) { 1764 ret = -EUCLEAN; 1765 btrfs_crit(root->fs_info, 1766 "send: found symlink extent with compression, ino %llu root %llu compression type %d", 1767 ino, btrfs_root_id(root), compression); 1768 goto out; 1769 } 1770 1771 off = btrfs_file_extent_inline_start(ei); 1772 len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); 1773 1774 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1775 1776 out: 1777 btrfs_free_path(path); 1778 return ret; 1779 } 1780 1781 /* 1782 * Helper function to generate a file name that is unique in the root of 1783 * send_root and parent_root. This is used to generate names for orphan inodes. 
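 * The generated name has the form o<ino>-<gen>-<idx> (e.g. o261-7-0), where
 * <idx> starts at 0 and is incremented until the name collides with nothing
 * in either root.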
1784 */ 1785 static int gen_unique_name(struct send_ctx *sctx, 1786 u64 ino, u64 gen, 1787 struct fs_path *dest) 1788 { 1789 int ret = 0; 1790 struct btrfs_path *path; 1791 struct btrfs_dir_item *di; 1792 char tmp[64]; 1793 int len; 1794 u64 idx = 0; 1795 1796 path = alloc_path_for_send(); 1797 if (!path) 1798 return -ENOMEM; 1799 1800 while (1) { 1801 struct fscrypt_str tmp_name; 1802 1803 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1804 ino, gen, idx); 1805 ASSERT(len < sizeof(tmp)); 1806 tmp_name.name = tmp; 1807 tmp_name.len = strlen(tmp); 1808 1809 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1810 path, BTRFS_FIRST_FREE_OBJECTID, 1811 &tmp_name, 0); 1812 btrfs_release_path(path); 1813 if (IS_ERR(di)) { 1814 ret = PTR_ERR(di); 1815 goto out; 1816 } 1817 if (di) { 1818 /* not unique, try again */ 1819 idx++; 1820 continue; 1821 } 1822 1823 if (!sctx->parent_root) { 1824 /* unique */ 1825 ret = 0; 1826 break; 1827 } 1828 1829 di = btrfs_lookup_dir_item(NULL, sctx->parent_root, 1830 path, BTRFS_FIRST_FREE_OBJECTID, 1831 &tmp_name, 0); 1832 btrfs_release_path(path); 1833 if (IS_ERR(di)) { 1834 ret = PTR_ERR(di); 1835 goto out; 1836 } 1837 if (di) { 1838 /* not unique, try again */ 1839 idx++; 1840 continue; 1841 } 1842 /* unique */ 1843 break; 1844 } 1845 1846 ret = fs_path_add(dest, tmp, strlen(tmp)); 1847 1848 out: 1849 btrfs_free_path(path); 1850 return ret; 1851 } 1852 1853 enum inode_state { 1854 inode_state_no_change, 1855 inode_state_will_create, 1856 inode_state_did_create, 1857 inode_state_will_delete, 1858 inode_state_did_delete, 1859 }; 1860 1861 static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, 1862 u64 *send_gen, u64 *parent_gen) 1863 { 1864 int ret; 1865 int left_ret; 1866 int right_ret; 1867 u64 left_gen; 1868 u64 right_gen = 0; 1869 struct btrfs_inode_info info; 1870 1871 ret = get_inode_info(sctx->send_root, ino, &info); 1872 if (ret < 0 && ret != -ENOENT) 1873 return ret; 1874 left_ret = (info.nlink == 0) ? -ENOENT : ret; 1875 left_gen = info.gen; 1876 if (send_gen) 1877 *send_gen = ((left_ret == -ENOENT) ? 0 : info.gen); 1878 1879 if (!sctx->parent_root) { 1880 right_ret = -ENOENT; 1881 } else { 1882 ret = get_inode_info(sctx->parent_root, ino, &info); 1883 if (ret < 0 && ret != -ENOENT) 1884 return ret; 1885 right_ret = (info.nlink == 0) ? -ENOENT : ret; 1886 right_gen = info.gen; 1887 if (parent_gen) 1888 *parent_gen = ((right_ret == -ENOENT) ? 
0 : info.gen); 1889 } 1890 1891 if (!left_ret && !right_ret) { 1892 if (left_gen == gen && right_gen == gen) { 1893 ret = inode_state_no_change; 1894 } else if (left_gen == gen) { 1895 if (ino < sctx->send_progress) 1896 ret = inode_state_did_create; 1897 else 1898 ret = inode_state_will_create; 1899 } else if (right_gen == gen) { 1900 if (ino < sctx->send_progress) 1901 ret = inode_state_did_delete; 1902 else 1903 ret = inode_state_will_delete; 1904 } else { 1905 ret = -ENOENT; 1906 } 1907 } else if (!left_ret) { 1908 if (left_gen == gen) { 1909 if (ino < sctx->send_progress) 1910 ret = inode_state_did_create; 1911 else 1912 ret = inode_state_will_create; 1913 } else { 1914 ret = -ENOENT; 1915 } 1916 } else if (!right_ret) { 1917 if (right_gen == gen) { 1918 if (ino < sctx->send_progress) 1919 ret = inode_state_did_delete; 1920 else 1921 ret = inode_state_will_delete; 1922 } else { 1923 ret = -ENOENT; 1924 } 1925 } else { 1926 ret = -ENOENT; 1927 } 1928 1929 return ret; 1930 } 1931 1932 static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, 1933 u64 *send_gen, u64 *parent_gen) 1934 { 1935 int ret; 1936 1937 if (ino == BTRFS_FIRST_FREE_OBJECTID) 1938 return 1; 1939 1940 ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); 1941 if (ret < 0) 1942 return ret; 1943 1944 if (ret == inode_state_no_change || 1945 ret == inode_state_did_create || 1946 ret == inode_state_will_delete) 1947 return 1; 1948 1949 return 0; 1950 } 1951 1952 /* 1953 * Helper function to lookup a dir item in a dir. 1954 */ 1955 static int lookup_dir_item_inode(struct btrfs_root *root, 1956 u64 dir, const char *name, int name_len, 1957 u64 *found_inode) 1958 { 1959 int ret = 0; 1960 struct btrfs_dir_item *di; 1961 struct btrfs_key key; 1962 struct btrfs_path *path; 1963 struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); 1964 1965 path = alloc_path_for_send(); 1966 if (!path) 1967 return -ENOMEM; 1968 1969 di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); 1970 if (IS_ERR_OR_NULL(di)) { 1971 ret = di ? PTR_ERR(di) : -ENOENT; 1972 goto out; 1973 } 1974 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1975 if (key.type == BTRFS_ROOT_ITEM_KEY) { 1976 ret = -ENOENT; 1977 goto out; 1978 } 1979 *found_inode = key.objectid; 1980 1981 out: 1982 btrfs_free_path(path); 1983 return ret; 1984 } 1985 1986 /* 1987 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1988 * generation of the parent dir and the name of the dir entry. 
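 * Returns 0 on success and -ENOENT if the inode has no INODE_REF or
 * INODE_EXTREF item. The parent directory's generation is only looked up
 * when dir_gen is not NULL.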
1989 */ 1990 static int get_first_ref(struct btrfs_root *root, u64 ino, 1991 u64 *dir, u64 *dir_gen, struct fs_path *name) 1992 { 1993 int ret; 1994 struct btrfs_key key; 1995 struct btrfs_key found_key; 1996 struct btrfs_path *path; 1997 int len; 1998 u64 parent_dir; 1999 2000 path = alloc_path_for_send(); 2001 if (!path) 2002 return -ENOMEM; 2003 2004 key.objectid = ino; 2005 key.type = BTRFS_INODE_REF_KEY; 2006 key.offset = 0; 2007 2008 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2009 if (ret < 0) 2010 goto out; 2011 if (!ret) 2012 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2013 path->slots[0]); 2014 if (ret || found_key.objectid != ino || 2015 (found_key.type != BTRFS_INODE_REF_KEY && 2016 found_key.type != BTRFS_INODE_EXTREF_KEY)) { 2017 ret = -ENOENT; 2018 goto out; 2019 } 2020 2021 if (found_key.type == BTRFS_INODE_REF_KEY) { 2022 struct btrfs_inode_ref *iref; 2023 iref = btrfs_item_ptr(path->nodes[0], path->slots[0], 2024 struct btrfs_inode_ref); 2025 len = btrfs_inode_ref_name_len(path->nodes[0], iref); 2026 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 2027 (unsigned long)(iref + 1), 2028 len); 2029 parent_dir = found_key.offset; 2030 } else { 2031 struct btrfs_inode_extref *extref; 2032 extref = btrfs_item_ptr(path->nodes[0], path->slots[0], 2033 struct btrfs_inode_extref); 2034 len = btrfs_inode_extref_name_len(path->nodes[0], extref); 2035 ret = fs_path_add_from_extent_buffer(name, path->nodes[0], 2036 (unsigned long)&extref->name, len); 2037 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref); 2038 } 2039 if (ret < 0) 2040 goto out; 2041 btrfs_release_path(path); 2042 2043 if (dir_gen) { 2044 ret = get_inode_gen(root, parent_dir, dir_gen); 2045 if (ret < 0) 2046 goto out; 2047 } 2048 2049 *dir = parent_dir; 2050 2051 out: 2052 btrfs_free_path(path); 2053 return ret; 2054 } 2055 2056 static int is_first_ref(struct btrfs_root *root, 2057 u64 ino, u64 dir, 2058 const char *name, int name_len) 2059 { 2060 int ret; 2061 struct fs_path *tmp_name; 2062 u64 tmp_dir; 2063 2064 tmp_name = fs_path_alloc(); 2065 if (!tmp_name) 2066 return -ENOMEM; 2067 2068 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name); 2069 if (ret < 0) 2070 goto out; 2071 2072 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) { 2073 ret = 0; 2074 goto out; 2075 } 2076 2077 ret = !memcmp(tmp_name->start, name, name_len); 2078 2079 out: 2080 fs_path_free(tmp_name); 2081 return ret; 2082 } 2083 2084 /* 2085 * Used by process_recorded_refs to determine if a new ref would overwrite an 2086 * already existing ref. In case it detects an overwrite, it returns the 2087 * inode/gen in who_ino/who_gen. 2088 * When an overwrite is detected, process_recorded_refs does proper orphanizing 2089 * to make sure later references to the overwritten inode are possible. 2090 * Orphanizing is however only required for the first ref of an inode. 2091 * process_recorded_refs does an additional is_first_ref check to see if 2092 * orphanizing is really required. 
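 * Returns 1 (and fills who_ino/who_gen/who_mode) when the new ref would
 * overwrite an existing ref of an inode that was not processed yet, 0 when
 * there is nothing to overwrite, and a negative errno on error.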
2093 */ 2094 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, 2095 const char *name, int name_len, 2096 u64 *who_ino, u64 *who_gen, u64 *who_mode) 2097 { 2098 int ret; 2099 u64 parent_root_dir_gen; 2100 u64 other_inode = 0; 2101 struct btrfs_inode_info info; 2102 2103 if (!sctx->parent_root) 2104 return 0; 2105 2106 ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen); 2107 if (ret <= 0) 2108 return 0; 2109 2110 /* 2111 * If we have a parent root we need to verify that the parent dir was 2112 * not deleted and then re-created, if it was then we have no overwrite 2113 * and we can just unlink this entry. 2114 * 2115 * @parent_root_dir_gen was set to 0 if the inode does not exist in the 2116 * parent root. 2117 */ 2118 if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID && 2119 parent_root_dir_gen != dir_gen) 2120 return 0; 2121 2122 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, 2123 &other_inode); 2124 if (ret == -ENOENT) 2125 return 0; 2126 else if (ret < 0) 2127 return ret; 2128 2129 /* 2130 * Check if the overwritten ref was already processed. If yes, the ref 2131 * was already unlinked/moved, so we can safely assume that we will not 2132 * overwrite anything at this point in time. 2133 */ 2134 if (other_inode > sctx->send_progress || 2135 is_waiting_for_move(sctx, other_inode)) { 2136 ret = get_inode_info(sctx->parent_root, other_inode, &info); 2137 if (ret < 0) 2138 return ret; 2139 2140 *who_ino = other_inode; 2141 *who_gen = info.gen; 2142 *who_mode = info.mode; 2143 return 1; 2144 } 2145 2146 return 0; 2147 } 2148 2149 /* 2150 * Checks if the ref was overwritten by an already processed inode. This is 2151 * used by __get_cur_name_and_parent to find out if the ref was orphanized and 2152 * thus the orphan name needs be used. 2153 * process_recorded_refs also uses it to avoid unlinking of refs that were 2154 * overwritten. 2155 */ 2156 static int did_overwrite_ref(struct send_ctx *sctx, 2157 u64 dir, u64 dir_gen, 2158 u64 ino, u64 ino_gen, 2159 const char *name, int name_len) 2160 { 2161 int ret; 2162 u64 ow_inode; 2163 u64 ow_gen = 0; 2164 u64 send_root_dir_gen; 2165 2166 if (!sctx->parent_root) 2167 return 0; 2168 2169 ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL); 2170 if (ret <= 0) 2171 return ret; 2172 2173 /* 2174 * @send_root_dir_gen was set to 0 if the inode does not exist in the 2175 * send root. 2176 */ 2177 if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen) 2178 return 0; 2179 2180 /* check if the ref was overwritten by another ref */ 2181 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len, 2182 &ow_inode); 2183 if (ret == -ENOENT) { 2184 /* was never and will never be overwritten */ 2185 return 0; 2186 } else if (ret < 0) { 2187 return ret; 2188 } 2189 2190 if (ow_inode == ino) { 2191 ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); 2192 if (ret < 0) 2193 return ret; 2194 2195 /* It's the same inode, so no overwrite happened. */ 2196 if (ow_gen == ino_gen) 2197 return 0; 2198 } 2199 2200 /* 2201 * We know that it is or will be overwritten. Check this now. 2202 * The current inode being processed might have been the one that caused 2203 * inode 'ino' to be orphanized, therefore check if ow_inode matches 2204 * the current inode being processed. 
2205 */ 2206 if (ow_inode < sctx->send_progress) 2207 return 1; 2208 2209 if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) { 2210 if (ow_gen == 0) { 2211 ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen); 2212 if (ret < 0) 2213 return ret; 2214 } 2215 if (ow_gen == sctx->cur_inode_gen) 2216 return 1; 2217 } 2218 2219 return 0; 2220 } 2221 2222 /* 2223 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode 2224 * that got overwritten. This is used by process_recorded_refs to determine 2225 * if it has to use the path as returned by get_cur_path or the orphan name. 2226 */ 2227 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen) 2228 { 2229 int ret = 0; 2230 struct fs_path *name = NULL; 2231 u64 dir; 2232 u64 dir_gen; 2233 2234 if (!sctx->parent_root) 2235 goto out; 2236 2237 name = fs_path_alloc(); 2238 if (!name) 2239 return -ENOMEM; 2240 2241 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name); 2242 if (ret < 0) 2243 goto out; 2244 2245 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen, 2246 name->start, fs_path_len(name)); 2247 2248 out: 2249 fs_path_free(name); 2250 return ret; 2251 } 2252 2253 static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx, 2254 u64 ino, u64 gen) 2255 { 2256 struct btrfs_lru_cache_entry *entry; 2257 2258 entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen); 2259 if (!entry) 2260 return NULL; 2261 2262 return container_of(entry, struct name_cache_entry, entry); 2263 } 2264 2265 /* 2266 * Used by get_cur_path for each ref up to the root. 2267 * Returns 0 if it succeeded. 2268 * Returns 1 if the inode is not existent or got overwritten. In that case, the 2269 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1 2270 * is returned, parent_ino/parent_gen are not guaranteed to be valid. 2271 * Returns <0 in case of error. 2272 */ 2273 static int __get_cur_name_and_parent(struct send_ctx *sctx, 2274 u64 ino, u64 gen, 2275 u64 *parent_ino, 2276 u64 *parent_gen, 2277 struct fs_path *dest) 2278 { 2279 int ret; 2280 int nce_ret; 2281 struct name_cache_entry *nce; 2282 2283 /* 2284 * First check if we already did a call to this function with the same 2285 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2286 * return the cached result. 2287 */ 2288 nce = name_cache_search(sctx, ino, gen); 2289 if (nce) { 2290 if (ino < sctx->send_progress && nce->need_later_update) { 2291 btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry); 2292 nce = NULL; 2293 } else { 2294 *parent_ino = nce->parent_ino; 2295 *parent_gen = nce->parent_gen; 2296 ret = fs_path_add(dest, nce->name, nce->name_len); 2297 if (ret < 0) 2298 return ret; 2299 return nce->ret; 2300 } 2301 } 2302 2303 /* 2304 * If the inode is not existent yet, add the orphan name and return 1. 2305 * This should only happen for the parent dir that we determine in 2306 * record_new_ref_if_needed(). 2307 */ 2308 ret = is_inode_existent(sctx, ino, gen, NULL, NULL); 2309 if (ret < 0) 2310 return ret; 2311 2312 if (!ret) { 2313 ret = gen_unique_name(sctx, ino, gen, dest); 2314 if (ret < 0) 2315 return ret; 2316 ret = 1; 2317 goto out_cache; 2318 } 2319 2320 /* 2321 * Depending on whether the inode was already processed or not, use 2322 * send_root or parent_root for ref lookup. 
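 * Inodes with a number lower than sctx->send_progress were already processed,
 * so their current name is found in the send root; unprocessed inodes still
 * carry their old name from the parent root.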
2323 */ 2324 if (ino < sctx->send_progress) 2325 ret = get_first_ref(sctx->send_root, ino, 2326 parent_ino, parent_gen, dest); 2327 else 2328 ret = get_first_ref(sctx->parent_root, ino, 2329 parent_ino, parent_gen, dest); 2330 if (ret < 0) 2331 return ret; 2332 2333 /* 2334 * Check if the ref was overwritten by an inode's ref that was processed 2335 * earlier. If yes, treat as orphan and return 1. 2336 */ 2337 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, 2338 dest->start, fs_path_len(dest)); 2339 if (ret < 0) 2340 return ret; 2341 if (ret) { 2342 fs_path_reset(dest); 2343 ret = gen_unique_name(sctx, ino, gen, dest); 2344 if (ret < 0) 2345 return ret; 2346 ret = 1; 2347 } 2348 2349 out_cache: 2350 /* 2351 * Store the result of the lookup in the name cache. 2352 */ 2353 nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); 2354 if (!nce) 2355 return -ENOMEM; 2356 2357 nce->entry.key = ino; 2358 nce->entry.gen = gen; 2359 nce->parent_ino = *parent_ino; 2360 nce->parent_gen = *parent_gen; 2361 nce->name_len = fs_path_len(dest); 2362 nce->ret = ret; 2363 memcpy(nce->name, dest->start, nce->name_len); 2364 2365 if (ino < sctx->send_progress) 2366 nce->need_later_update = 0; 2367 else 2368 nce->need_later_update = 1; 2369 2370 nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); 2371 if (nce_ret < 0) { 2372 kfree(nce); 2373 return nce_ret; 2374 } 2375 2376 return ret; 2377 } 2378 2379 /* 2380 * Magic happens here. This function returns the first ref to an inode as it 2381 * would look like while receiving the stream at this point in time. 2382 * We walk the path up to the root. For every inode in between, we check if it 2383 * was already processed/sent. If yes, we continue with the parent as found 2384 * in send_root. If not, we continue with the parent as found in parent_root. 2385 * If we encounter an inode that was deleted at this point in time, we use the 2386 * inodes "orphan" name instead of the real name and stop. Same with new inodes 2387 * that were not created yet and overwritten inodes/refs. 2388 * 2389 * When do we have orphan inodes: 2390 * 1. When an inode is freshly created and thus no valid refs are available yet 2391 * 2. When a directory lost all it's refs (deleted) but still has dir items 2392 * inside which were not processed yet (pending for move/delete). If anyone 2393 * tried to get the path to the dir items, it would get a path inside that 2394 * orphan directory. 2395 * 3. When an inode is moved around or gets new links, it may overwrite the ref 2396 * of an unprocessed inode. If in that case the first ref would be 2397 * overwritten, the overwritten inode gets "orphanized". Later when we 2398 * process this overwritten inode, it is restored at a new place by moving 2399 * the orphan inode. 2400 * 2401 * sctx->send_progress tells this function at which point in time receiving 2402 * would be. 
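 *
 * The path of the inode currently being processed is cached in
 * sctx->cur_inode_path, so repeated lookups for sctx->cur_ino don't have to
 * rebuild it.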
2403 */ 2404 static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, 2405 struct fs_path *dest) 2406 { 2407 int ret = 0; 2408 struct fs_path *name = NULL; 2409 u64 parent_inode = 0; 2410 u64 parent_gen = 0; 2411 int stop = 0; 2412 const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); 2413 2414 if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { 2415 if (dest != &sctx->cur_inode_path) 2416 return fs_path_copy(dest, &sctx->cur_inode_path); 2417 2418 return 0; 2419 } 2420 2421 name = fs_path_alloc(); 2422 if (!name) { 2423 ret = -ENOMEM; 2424 goto out; 2425 } 2426 2427 dest->reversed = 1; 2428 fs_path_reset(dest); 2429 2430 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2431 struct waiting_dir_move *wdm; 2432 2433 fs_path_reset(name); 2434 2435 if (is_waiting_for_rm(sctx, ino, gen)) { 2436 ret = gen_unique_name(sctx, ino, gen, name); 2437 if (ret < 0) 2438 goto out; 2439 ret = fs_path_add_path(dest, name); 2440 break; 2441 } 2442 2443 wdm = get_waiting_dir_move(sctx, ino); 2444 if (wdm && wdm->orphanized) { 2445 ret = gen_unique_name(sctx, ino, gen, name); 2446 stop = 1; 2447 } else if (wdm) { 2448 ret = get_first_ref(sctx->parent_root, ino, 2449 &parent_inode, &parent_gen, name); 2450 } else { 2451 ret = __get_cur_name_and_parent(sctx, ino, gen, 2452 &parent_inode, 2453 &parent_gen, name); 2454 if (ret) 2455 stop = 1; 2456 } 2457 2458 if (ret < 0) 2459 goto out; 2460 2461 ret = fs_path_add_path(dest, name); 2462 if (ret < 0) 2463 goto out; 2464 2465 ino = parent_inode; 2466 gen = parent_gen; 2467 } 2468 2469 out: 2470 fs_path_free(name); 2471 if (!ret) { 2472 fs_path_unreverse(dest); 2473 if (is_cur_inode && dest != &sctx->cur_inode_path) 2474 ret = fs_path_copy(&sctx->cur_inode_path, dest); 2475 } 2476 2477 return ret; 2478 } 2479 2480 /* 2481 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace 2482 */ 2483 static int send_subvol_begin(struct send_ctx *sctx) 2484 { 2485 int ret; 2486 struct btrfs_root *send_root = sctx->send_root; 2487 struct btrfs_root *parent_root = sctx->parent_root; 2488 struct btrfs_path *path; 2489 struct btrfs_key key; 2490 struct btrfs_root_ref *ref; 2491 struct extent_buffer *leaf; 2492 char *name = NULL; 2493 int namelen; 2494 2495 path = btrfs_alloc_path(); 2496 if (!path) 2497 return -ENOMEM; 2498 2499 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); 2500 if (!name) { 2501 btrfs_free_path(path); 2502 return -ENOMEM; 2503 } 2504 2505 key.objectid = btrfs_root_id(send_root); 2506 key.type = BTRFS_ROOT_BACKREF_KEY; 2507 key.offset = 0; 2508 2509 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, 2510 &key, path, 1, 0); 2511 if (ret < 0) 2512 goto out; 2513 if (ret) { 2514 ret = -ENOENT; 2515 goto out; 2516 } 2517 2518 leaf = path->nodes[0]; 2519 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2520 if (key.type != BTRFS_ROOT_BACKREF_KEY || 2521 key.objectid != btrfs_root_id(send_root)) { 2522 ret = -ENOENT; 2523 goto out; 2524 } 2525 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 2526 namelen = btrfs_root_ref_name_len(leaf, ref); 2527 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen); 2528 btrfs_release_path(path); 2529 2530 if (parent_root) { 2531 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); 2532 if (ret < 0) 2533 goto out; 2534 } else { 2535 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); 2536 if (ret < 0) 2537 goto out; 2538 } 2539 2540 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); 2541 2542 if 
(!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) 2543 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2544 sctx->send_root->root_item.received_uuid); 2545 else 2546 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2547 sctx->send_root->root_item.uuid); 2548 2549 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2550 btrfs_root_ctransid(&sctx->send_root->root_item)); 2551 if (parent_root) { 2552 if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) 2553 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2554 parent_root->root_item.received_uuid); 2555 else 2556 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2557 parent_root->root_item.uuid); 2558 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2559 btrfs_root_ctransid(&sctx->parent_root->root_item)); 2560 } 2561 2562 ret = send_cmd(sctx); 2563 2564 tlv_put_failure: 2565 out: 2566 btrfs_free_path(path); 2567 kfree(name); 2568 return ret; 2569 } 2570 2571 static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) 2572 { 2573 if (fs_path_len(&sctx->cur_inode_path) == 0) { 2574 int ret; 2575 2576 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, 2577 &sctx->cur_inode_path); 2578 if (ret < 0) 2579 return ERR_PTR(ret); 2580 } 2581 2582 return &sctx->cur_inode_path; 2583 } 2584 2585 static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) 2586 { 2587 struct fs_path *path; 2588 int ret; 2589 2590 if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) 2591 return get_cur_inode_path(sctx); 2592 2593 path = fs_path_alloc(); 2594 if (!path) 2595 return ERR_PTR(-ENOMEM); 2596 2597 ret = get_cur_path(sctx, ino, gen, path); 2598 if (ret < 0) { 2599 fs_path_free(path); 2600 return ERR_PTR(ret); 2601 } 2602 2603 return path; 2604 } 2605 2606 static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) 2607 { 2608 if (path != &sctx->cur_inode_path) 2609 fs_path_free(path); 2610 } 2611 2612 static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) 2613 { 2614 int ret = 0; 2615 struct fs_path *p; 2616 2617 p = get_path_for_command(sctx, ino, gen); 2618 if (IS_ERR(p)) 2619 return PTR_ERR(p); 2620 2621 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); 2622 if (ret < 0) 2623 goto out; 2624 2625 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2626 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); 2627 2628 ret = send_cmd(sctx); 2629 2630 tlv_put_failure: 2631 out: 2632 free_path_for_command(sctx, p); 2633 return ret; 2634 } 2635 2636 static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) 2637 { 2638 int ret = 0; 2639 struct fs_path *p; 2640 2641 p = get_path_for_command(sctx, ino, gen); 2642 if (IS_ERR(p)) 2643 return PTR_ERR(p); 2644 2645 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); 2646 if (ret < 0) 2647 goto out; 2648 2649 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2650 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); 2651 2652 ret = send_cmd(sctx); 2653 2654 tlv_put_failure: 2655 out: 2656 free_path_for_command(sctx, p); 2657 return ret; 2658 } 2659 2660 static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) 2661 { 2662 int ret = 0; 2663 struct fs_path *p; 2664 2665 if (sctx->proto < 2) 2666 return 0; 2667 2668 p = get_path_for_command(sctx, ino, gen); 2669 if (IS_ERR(p)) 2670 return PTR_ERR(p); 2671 2672 ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); 2673 if (ret < 0) 2674 goto out; 2675 2676 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2677 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); 2678 2679 ret = send_cmd(sctx); 2680 2681 tlv_put_failure: 2682 out: 2683 
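	/* TLV_PUT_* failures jump to tlv_put_failure; both paths share this cleanup. */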
free_path_for_command(sctx, p); 2684 return ret; 2685 } 2686 2687 static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) 2688 { 2689 int ret = 0; 2690 struct fs_path *p; 2691 2692 p = get_path_for_command(sctx, ino, gen); 2693 if (IS_ERR(p)) 2694 return PTR_ERR(p); 2695 2696 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); 2697 if (ret < 0) 2698 goto out; 2699 2700 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2701 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); 2702 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); 2703 2704 ret = send_cmd(sctx); 2705 2706 tlv_put_failure: 2707 out: 2708 free_path_for_command(sctx, p); 2709 return ret; 2710 } 2711 2712 static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) 2713 { 2714 int ret = 0; 2715 struct fs_path *p = NULL; 2716 struct btrfs_inode_item *ii; 2717 struct btrfs_path *path = NULL; 2718 struct extent_buffer *eb; 2719 struct btrfs_key key; 2720 int slot; 2721 2722 p = get_path_for_command(sctx, ino, gen); 2723 if (IS_ERR(p)) 2724 return PTR_ERR(p); 2725 2726 path = alloc_path_for_send(); 2727 if (!path) { 2728 ret = -ENOMEM; 2729 goto out; 2730 } 2731 2732 key.objectid = ino; 2733 key.type = BTRFS_INODE_ITEM_KEY; 2734 key.offset = 0; 2735 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); 2736 if (ret > 0) 2737 ret = -ENOENT; 2738 if (ret < 0) 2739 goto out; 2740 2741 eb = path->nodes[0]; 2742 slot = path->slots[0]; 2743 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 2744 2745 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES); 2746 if (ret < 0) 2747 goto out; 2748 2749 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2750 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); 2751 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); 2752 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); 2753 if (sctx->proto >= 2) 2754 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); 2755 2756 ret = send_cmd(sctx); 2757 2758 tlv_put_failure: 2759 out: 2760 free_path_for_command(sctx, p); 2761 btrfs_free_path(path); 2762 return ret; 2763 } 2764 2765 /* 2766 * If the cache is full, we can't remove entries from it and do a call to 2767 * send_utimes() for each respective inode, because we might be finishing 2768 * processing an inode that is a directory and it just got renamed, and existing 2769 * entries in the cache may refer to inodes that have the directory in their 2770 * full path - in which case we would generate outdated paths (pre-rename) 2771 * for the inodes that the cache entries point to. Instead of prunning the 2772 * cache when inserting, do it after we finish processing each inode at 2773 * finish_inode_if_needed(). 2774 */ 2775 static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen) 2776 { 2777 struct btrfs_lru_cache_entry *entry; 2778 int ret; 2779 2780 entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen); 2781 if (entry != NULL) 2782 return 0; 2783 2784 /* Caching is optional, don't fail if we can't allocate memory. 
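 * Fall back to sending the utimes command right away instead.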
*/ 2785 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2786 if (!entry) 2787 return send_utimes(sctx, dir, gen); 2788 2789 entry->key = dir; 2790 entry->gen = gen; 2791 2792 ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL); 2793 ASSERT(ret != -EEXIST); 2794 if (ret) { 2795 kfree(entry); 2796 return send_utimes(sctx, dir, gen); 2797 } 2798 2799 return 0; 2800 } 2801 2802 static int trim_dir_utimes_cache(struct send_ctx *sctx) 2803 { 2804 while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) { 2805 struct btrfs_lru_cache_entry *lru; 2806 int ret; 2807 2808 lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache); 2809 ASSERT(lru != NULL); 2810 2811 ret = send_utimes(sctx, lru->key, lru->gen); 2812 if (ret) 2813 return ret; 2814 2815 btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru); 2816 } 2817 2818 return 0; 2819 } 2820 2821 /* 2822 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have 2823 * a valid path yet because we did not process the refs yet. So, the inode 2824 * is created as orphan. 2825 */ 2826 static int send_create_inode(struct send_ctx *sctx, u64 ino) 2827 { 2828 int ret = 0; 2829 struct fs_path *p; 2830 int cmd; 2831 struct btrfs_inode_info info; 2832 u64 gen; 2833 u64 mode; 2834 u64 rdev; 2835 2836 p = fs_path_alloc(); 2837 if (!p) 2838 return -ENOMEM; 2839 2840 if (ino != sctx->cur_ino) { 2841 ret = get_inode_info(sctx->send_root, ino, &info); 2842 if (ret < 0) 2843 goto out; 2844 gen = info.gen; 2845 mode = info.mode; 2846 rdev = info.rdev; 2847 } else { 2848 gen = sctx->cur_inode_gen; 2849 mode = sctx->cur_inode_mode; 2850 rdev = sctx->cur_inode_rdev; 2851 } 2852 2853 if (S_ISREG(mode)) { 2854 cmd = BTRFS_SEND_C_MKFILE; 2855 } else if (S_ISDIR(mode)) { 2856 cmd = BTRFS_SEND_C_MKDIR; 2857 } else if (S_ISLNK(mode)) { 2858 cmd = BTRFS_SEND_C_SYMLINK; 2859 } else if (S_ISCHR(mode) || S_ISBLK(mode)) { 2860 cmd = BTRFS_SEND_C_MKNOD; 2861 } else if (S_ISFIFO(mode)) { 2862 cmd = BTRFS_SEND_C_MKFIFO; 2863 } else if (S_ISSOCK(mode)) { 2864 cmd = BTRFS_SEND_C_MKSOCK; 2865 } else { 2866 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", 2867 (int)(mode & S_IFMT)); 2868 ret = -EOPNOTSUPP; 2869 goto out; 2870 } 2871 2872 ret = begin_cmd(sctx, cmd); 2873 if (ret < 0) 2874 goto out; 2875 2876 ret = gen_unique_name(sctx, ino, gen, p); 2877 if (ret < 0) 2878 goto out; 2879 2880 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2881 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino); 2882 2883 if (S_ISLNK(mode)) { 2884 fs_path_reset(p); 2885 ret = read_symlink(sctx->send_root, ino, p); 2886 if (ret < 0) 2887 goto out; 2888 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2889 } else if (S_ISCHR(mode) || S_ISBLK(mode) || 2890 S_ISFIFO(mode) || S_ISSOCK(mode)) { 2891 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev)); 2892 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode); 2893 } 2894 2895 ret = send_cmd(sctx); 2896 if (ret < 0) 2897 goto out; 2898 2899 2900 tlv_put_failure: 2901 out: 2902 fs_path_free(p); 2903 return ret; 2904 } 2905 2906 static void cache_dir_created(struct send_ctx *sctx, u64 dir) 2907 { 2908 struct btrfs_lru_cache_entry *entry; 2909 int ret; 2910 2911 /* Caching is optional, ignore any failures. 
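 * A missing entry only means did_create_dir() has to search the dir index keys again.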
*/ 2912 entry = kmalloc(sizeof(*entry), GFP_KERNEL); 2913 if (!entry) 2914 return; 2915 2916 entry->key = dir; 2917 entry->gen = 0; 2918 ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL); 2919 if (ret < 0) 2920 kfree(entry); 2921 } 2922 2923 /* 2924 * We need some special handling for inodes that get processed before the parent 2925 * directory got created. See process_recorded_refs for details. 2926 * This function does the check if we already created the dir out of order. 2927 */ 2928 static int did_create_dir(struct send_ctx *sctx, u64 dir) 2929 { 2930 int ret = 0; 2931 int iter_ret = 0; 2932 struct btrfs_path *path = NULL; 2933 struct btrfs_key key; 2934 struct btrfs_key found_key; 2935 struct btrfs_key di_key; 2936 struct btrfs_dir_item *di; 2937 2938 if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0)) 2939 return 1; 2940 2941 path = alloc_path_for_send(); 2942 if (!path) 2943 return -ENOMEM; 2944 2945 key.objectid = dir; 2946 key.type = BTRFS_DIR_INDEX_KEY; 2947 key.offset = 0; 2948 2949 btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { 2950 struct extent_buffer *eb = path->nodes[0]; 2951 2952 if (found_key.objectid != key.objectid || 2953 found_key.type != key.type) { 2954 ret = 0; 2955 break; 2956 } 2957 2958 di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); 2959 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2960 2961 if (di_key.type != BTRFS_ROOT_ITEM_KEY && 2962 di_key.objectid < sctx->send_progress) { 2963 ret = 1; 2964 cache_dir_created(sctx, dir); 2965 break; 2966 } 2967 } 2968 /* Catch error found during iteration */ 2969 if (iter_ret < 0) 2970 ret = iter_ret; 2971 2972 btrfs_free_path(path); 2973 return ret; 2974 } 2975 2976 /* 2977 * Only creates the inode if it is: 2978 * 1. Not a directory 2979 * 2. Or a directory which was not created already due to out of order 2980 * directories. See did_create_dir and process_recorded_refs for details. 
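 *
 * Directories created here are added to the dir_created_cache so that later
 * refs inside them don't trigger a second creation.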
2981 */ 2982 static int send_create_inode_if_needed(struct send_ctx *sctx) 2983 { 2984 int ret; 2985 2986 if (S_ISDIR(sctx->cur_inode_mode)) { 2987 ret = did_create_dir(sctx, sctx->cur_ino); 2988 if (ret < 0) 2989 return ret; 2990 else if (ret > 0) 2991 return 0; 2992 } 2993 2994 ret = send_create_inode(sctx, sctx->cur_ino); 2995 2996 if (ret == 0 && S_ISDIR(sctx->cur_inode_mode)) 2997 cache_dir_created(sctx, sctx->cur_ino); 2998 2999 return ret; 3000 } 3001 3002 struct recorded_ref { 3003 struct list_head list; 3004 char *name; 3005 struct fs_path *full_path; 3006 u64 dir; 3007 u64 dir_gen; 3008 int name_len; 3009 struct rb_node node; 3010 struct rb_root *root; 3011 }; 3012 3013 static struct recorded_ref *recorded_ref_alloc(void) 3014 { 3015 struct recorded_ref *ref; 3016 3017 ref = kzalloc(sizeof(*ref), GFP_KERNEL); 3018 if (!ref) 3019 return NULL; 3020 RB_CLEAR_NODE(&ref->node); 3021 INIT_LIST_HEAD(&ref->list); 3022 return ref; 3023 } 3024 3025 static void recorded_ref_free(struct recorded_ref *ref) 3026 { 3027 if (!ref) 3028 return; 3029 if (!RB_EMPTY_NODE(&ref->node)) 3030 rb_erase(&ref->node, ref->root); 3031 list_del(&ref->list); 3032 fs_path_free(ref->full_path); 3033 kfree(ref); 3034 } 3035 3036 static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) 3037 { 3038 ref->full_path = path; 3039 ref->name = (char *)kbasename(ref->full_path->start); 3040 ref->name_len = ref->full_path->end - ref->name; 3041 } 3042 3043 static int dup_ref(struct recorded_ref *ref, struct list_head *list) 3044 { 3045 struct recorded_ref *new; 3046 3047 new = recorded_ref_alloc(); 3048 if (!new) 3049 return -ENOMEM; 3050 3051 new->dir = ref->dir; 3052 new->dir_gen = ref->dir_gen; 3053 list_add_tail(&new->list, list); 3054 return 0; 3055 } 3056 3057 static void __free_recorded_refs(struct list_head *head) 3058 { 3059 struct recorded_ref *cur; 3060 3061 while (!list_empty(head)) { 3062 cur = list_first_entry(head, struct recorded_ref, list); 3063 recorded_ref_free(cur); 3064 } 3065 } 3066 3067 static void free_recorded_refs(struct send_ctx *sctx) 3068 { 3069 __free_recorded_refs(&sctx->new_refs); 3070 __free_recorded_refs(&sctx->deleted_refs); 3071 } 3072 3073 /* 3074 * Renames/moves a file/dir to its orphan name. Used when the first 3075 * ref of an unprocessed inode gets overwritten and for all non empty 3076 * directories. 
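 * The orphan name is generated by gen_unique_name() from the inode number and
 * its generation. If the inode being orphanized is the one currently being
 * processed, sctx->cur_inode_path is updated to the orphan name as well.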
3077 */ 3078 static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, 3079 struct fs_path *path) 3080 { 3081 int ret; 3082 struct fs_path *orphan; 3083 3084 orphan = fs_path_alloc(); 3085 if (!orphan) 3086 return -ENOMEM; 3087 3088 ret = gen_unique_name(sctx, ino, gen, orphan); 3089 if (ret < 0) 3090 goto out; 3091 3092 ret = send_rename(sctx, path, orphan); 3093 if (ret < 0) 3094 goto out; 3095 3096 if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) 3097 ret = fs_path_copy(&sctx->cur_inode_path, orphan); 3098 3099 out: 3100 fs_path_free(orphan); 3101 return ret; 3102 } 3103 3104 static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, 3105 u64 dir_ino, u64 dir_gen) 3106 { 3107 struct rb_node **p = &sctx->orphan_dirs.rb_node; 3108 struct rb_node *parent = NULL; 3109 struct orphan_dir_info *entry, *odi; 3110 3111 while (*p) { 3112 parent = *p; 3113 entry = rb_entry(parent, struct orphan_dir_info, node); 3114 if (dir_ino < entry->ino) 3115 p = &(*p)->rb_left; 3116 else if (dir_ino > entry->ino) 3117 p = &(*p)->rb_right; 3118 else if (dir_gen < entry->gen) 3119 p = &(*p)->rb_left; 3120 else if (dir_gen > entry->gen) 3121 p = &(*p)->rb_right; 3122 else 3123 return entry; 3124 } 3125 3126 odi = kmalloc(sizeof(*odi), GFP_KERNEL); 3127 if (!odi) 3128 return ERR_PTR(-ENOMEM); 3129 odi->ino = dir_ino; 3130 odi->gen = dir_gen; 3131 odi->last_dir_index_offset = 0; 3132 odi->dir_high_seq_ino = 0; 3133 3134 rb_link_node(&odi->node, parent, p); 3135 rb_insert_color(&odi->node, &sctx->orphan_dirs); 3136 return odi; 3137 } 3138 3139 static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx, 3140 u64 dir_ino, u64 gen) 3141 { 3142 struct rb_node *n = sctx->orphan_dirs.rb_node; 3143 struct orphan_dir_info *entry; 3144 3145 while (n) { 3146 entry = rb_entry(n, struct orphan_dir_info, node); 3147 if (dir_ino < entry->ino) 3148 n = n->rb_left; 3149 else if (dir_ino > entry->ino) 3150 n = n->rb_right; 3151 else if (gen < entry->gen) 3152 n = n->rb_left; 3153 else if (gen > entry->gen) 3154 n = n->rb_right; 3155 else 3156 return entry; 3157 } 3158 return NULL; 3159 } 3160 3161 static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen) 3162 { 3163 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen); 3164 3165 return odi != NULL; 3166 } 3167 3168 static void free_orphan_dir_info(struct send_ctx *sctx, 3169 struct orphan_dir_info *odi) 3170 { 3171 if (!odi) 3172 return; 3173 rb_erase(&odi->node, &sctx->orphan_dirs); 3174 kfree(odi); 3175 } 3176 3177 /* 3178 * Returns 1 if a directory can be removed at this point in time. 3179 * We check this by iterating all dir items and checking if the inode behind 3180 * the dir item was already processed. 3181 */ 3182 static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen) 3183 { 3184 int ret = 0; 3185 int iter_ret = 0; 3186 struct btrfs_root *root = sctx->parent_root; 3187 struct btrfs_path *path; 3188 struct btrfs_key key; 3189 struct btrfs_key found_key; 3190 struct btrfs_key loc; 3191 struct btrfs_dir_item *di; 3192 struct orphan_dir_info *odi = NULL; 3193 u64 dir_high_seq_ino = 0; 3194 u64 last_dir_index_offset = 0; 3195 3196 /* 3197 * Don't try to rmdir the top/root subvolume dir. 
3198 */ 3199 if (dir == BTRFS_FIRST_FREE_OBJECTID) 3200 return 0; 3201 3202 odi = get_orphan_dir_info(sctx, dir, dir_gen); 3203 if (odi && sctx->cur_ino < odi->dir_high_seq_ino) 3204 return 0; 3205 3206 path = alloc_path_for_send(); 3207 if (!path) 3208 return -ENOMEM; 3209 3210 if (!odi) { 3211 /* 3212 * Find the inode number associated with the last dir index 3213 * entry. This is very likely the inode with the highest number 3214 * of all inodes that have an entry in the directory. We can 3215 * then use it to avoid future calls to can_rmdir(), when 3216 * processing inodes with a lower number, from having to search 3217 * the parent root b+tree for dir index keys. 3218 */ 3219 key.objectid = dir; 3220 key.type = BTRFS_DIR_INDEX_KEY; 3221 key.offset = (u64)-1; 3222 3223 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3224 if (ret < 0) { 3225 goto out; 3226 } else if (ret > 0) { 3227 /* Can't happen, the root is never empty. */ 3228 ASSERT(path->slots[0] > 0); 3229 if (WARN_ON(path->slots[0] == 0)) { 3230 ret = -EUCLEAN; 3231 goto out; 3232 } 3233 path->slots[0]--; 3234 } 3235 3236 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3237 if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) { 3238 /* No index keys, dir can be removed. */ 3239 ret = 1; 3240 goto out; 3241 } 3242 3243 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 3244 struct btrfs_dir_item); 3245 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 3246 dir_high_seq_ino = loc.objectid; 3247 if (sctx->cur_ino < dir_high_seq_ino) { 3248 ret = 0; 3249 goto out; 3250 } 3251 3252 btrfs_release_path(path); 3253 } 3254 3255 key.objectid = dir; 3256 key.type = BTRFS_DIR_INDEX_KEY; 3257 key.offset = (odi ? odi->last_dir_index_offset : 0); 3258 3259 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 3260 struct waiting_dir_move *dm; 3261 3262 if (found_key.objectid != key.objectid || 3263 found_key.type != key.type) 3264 break; 3265 3266 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 3267 struct btrfs_dir_item); 3268 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 3269 3270 dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid); 3271 last_dir_index_offset = found_key.offset; 3272 3273 dm = get_waiting_dir_move(sctx, loc.objectid); 3274 if (dm) { 3275 dm->rmdir_ino = dir; 3276 dm->rmdir_gen = dir_gen; 3277 ret = 0; 3278 goto out; 3279 } 3280 3281 if (loc.objectid > sctx->cur_ino) { 3282 ret = 0; 3283 goto out; 3284 } 3285 } 3286 if (iter_ret < 0) { 3287 ret = iter_ret; 3288 goto out; 3289 } 3290 free_orphan_dir_info(sctx, odi); 3291 3292 ret = 1; 3293 3294 out: 3295 btrfs_free_path(path); 3296 3297 if (ret) 3298 return ret; 3299 3300 if (!odi) { 3301 odi = add_orphan_dir_info(sctx, dir, dir_gen); 3302 if (IS_ERR(odi)) 3303 return PTR_ERR(odi); 3304 3305 odi->gen = dir_gen; 3306 } 3307 3308 odi->last_dir_index_offset = last_dir_index_offset; 3309 odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino); 3310 3311 return 0; 3312 } 3313 3314 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 3315 { 3316 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); 3317 3318 return entry != NULL; 3319 } 3320 3321 static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) 3322 { 3323 struct rb_node **p = &sctx->waiting_dir_moves.rb_node; 3324 struct rb_node *parent = NULL; 3325 struct waiting_dir_move *entry, *dm; 3326 3327 dm = kmalloc(sizeof(*dm), GFP_KERNEL); 3328 if (!dm) 3329 return -ENOMEM; 3330 dm->ino = ino; 3331 dm->rmdir_ino = 0; 
3332 dm->rmdir_gen = 0; 3333 dm->orphanized = orphanized; 3334 3335 while (*p) { 3336 parent = *p; 3337 entry = rb_entry(parent, struct waiting_dir_move, node); 3338 if (ino < entry->ino) { 3339 p = &(*p)->rb_left; 3340 } else if (ino > entry->ino) { 3341 p = &(*p)->rb_right; 3342 } else { 3343 kfree(dm); 3344 return -EEXIST; 3345 } 3346 } 3347 3348 rb_link_node(&dm->node, parent, p); 3349 rb_insert_color(&dm->node, &sctx->waiting_dir_moves); 3350 return 0; 3351 } 3352 3353 static struct waiting_dir_move * 3354 get_waiting_dir_move(struct send_ctx *sctx, u64 ino) 3355 { 3356 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 3357 struct waiting_dir_move *entry; 3358 3359 while (n) { 3360 entry = rb_entry(n, struct waiting_dir_move, node); 3361 if (ino < entry->ino) 3362 n = n->rb_left; 3363 else if (ino > entry->ino) 3364 n = n->rb_right; 3365 else 3366 return entry; 3367 } 3368 return NULL; 3369 } 3370 3371 static void free_waiting_dir_move(struct send_ctx *sctx, 3372 struct waiting_dir_move *dm) 3373 { 3374 if (!dm) 3375 return; 3376 rb_erase(&dm->node, &sctx->waiting_dir_moves); 3377 kfree(dm); 3378 } 3379 3380 static int add_pending_dir_move(struct send_ctx *sctx, 3381 u64 ino, 3382 u64 ino_gen, 3383 u64 parent_ino, 3384 struct list_head *new_refs, 3385 struct list_head *deleted_refs, 3386 const bool is_orphan) 3387 { 3388 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 3389 struct rb_node *parent = NULL; 3390 struct pending_dir_move *entry = NULL, *pm; 3391 struct recorded_ref *cur; 3392 int exists = 0; 3393 int ret; 3394 3395 pm = kmalloc(sizeof(*pm), GFP_KERNEL); 3396 if (!pm) 3397 return -ENOMEM; 3398 pm->parent_ino = parent_ino; 3399 pm->ino = ino; 3400 pm->gen = ino_gen; 3401 INIT_LIST_HEAD(&pm->list); 3402 INIT_LIST_HEAD(&pm->update_refs); 3403 RB_CLEAR_NODE(&pm->node); 3404 3405 while (*p) { 3406 parent = *p; 3407 entry = rb_entry(parent, struct pending_dir_move, node); 3408 if (parent_ino < entry->parent_ino) { 3409 p = &(*p)->rb_left; 3410 } else if (parent_ino > entry->parent_ino) { 3411 p = &(*p)->rb_right; 3412 } else { 3413 exists = 1; 3414 break; 3415 } 3416 } 3417 3418 list_for_each_entry(cur, deleted_refs, list) { 3419 ret = dup_ref(cur, &pm->update_refs); 3420 if (ret < 0) 3421 goto out; 3422 } 3423 list_for_each_entry(cur, new_refs, list) { 3424 ret = dup_ref(cur, &pm->update_refs); 3425 if (ret < 0) 3426 goto out; 3427 } 3428 3429 ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); 3430 if (ret) 3431 goto out; 3432 3433 if (exists) { 3434 list_add_tail(&pm->list, &entry->list); 3435 } else { 3436 rb_link_node(&pm->node, parent, p); 3437 rb_insert_color(&pm->node, &sctx->pending_dir_moves); 3438 } 3439 ret = 0; 3440 out: 3441 if (ret) { 3442 __free_recorded_refs(&pm->update_refs); 3443 kfree(pm); 3444 } 3445 return ret; 3446 } 3447 3448 static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, 3449 u64 parent_ino) 3450 { 3451 struct rb_node *n = sctx->pending_dir_moves.rb_node; 3452 struct pending_dir_move *entry; 3453 3454 while (n) { 3455 entry = rb_entry(n, struct pending_dir_move, node); 3456 if (parent_ino < entry->parent_ino) 3457 n = n->rb_left; 3458 else if (parent_ino > entry->parent_ino) 3459 n = n->rb_right; 3460 else 3461 return entry; 3462 } 3463 return NULL; 3464 } 3465 3466 static int path_loop(struct send_ctx *sctx, struct fs_path *name, 3467 u64 ino, u64 gen, u64 *ancestor_ino) 3468 { 3469 int ret = 0; 3470 u64 parent_inode = 0; 3471 u64 parent_gen = 0; 3472 u64 start_ino = ino; 3473 3474 *ancestor_ino = 0; 3475 while 
(ino != BTRFS_FIRST_FREE_OBJECTID) { 3476 fs_path_reset(name); 3477 3478 if (is_waiting_for_rm(sctx, ino, gen)) 3479 break; 3480 if (is_waiting_for_move(sctx, ino)) { 3481 if (*ancestor_ino == 0) 3482 *ancestor_ino = ino; 3483 ret = get_first_ref(sctx->parent_root, ino, 3484 &parent_inode, &parent_gen, name); 3485 } else { 3486 ret = __get_cur_name_and_parent(sctx, ino, gen, 3487 &parent_inode, 3488 &parent_gen, name); 3489 if (ret > 0) { 3490 ret = 0; 3491 break; 3492 } 3493 } 3494 if (ret < 0) 3495 break; 3496 if (parent_inode == start_ino) { 3497 ret = 1; 3498 if (*ancestor_ino == 0) 3499 *ancestor_ino = ino; 3500 break; 3501 } 3502 ino = parent_inode; 3503 gen = parent_gen; 3504 } 3505 return ret; 3506 } 3507 3508 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3509 { 3510 struct fs_path *from_path = NULL; 3511 struct fs_path *to_path = NULL; 3512 struct fs_path *name = NULL; 3513 u64 orig_progress = sctx->send_progress; 3514 struct recorded_ref *cur; 3515 u64 parent_ino, parent_gen; 3516 struct waiting_dir_move *dm = NULL; 3517 u64 rmdir_ino = 0; 3518 u64 rmdir_gen; 3519 u64 ancestor; 3520 bool is_orphan; 3521 int ret; 3522 3523 name = fs_path_alloc(); 3524 from_path = fs_path_alloc(); 3525 if (!name || !from_path) { 3526 ret = -ENOMEM; 3527 goto out; 3528 } 3529 3530 dm = get_waiting_dir_move(sctx, pm->ino); 3531 ASSERT(dm); 3532 rmdir_ino = dm->rmdir_ino; 3533 rmdir_gen = dm->rmdir_gen; 3534 is_orphan = dm->orphanized; 3535 free_waiting_dir_move(sctx, dm); 3536 3537 if (is_orphan) { 3538 ret = gen_unique_name(sctx, pm->ino, 3539 pm->gen, from_path); 3540 } else { 3541 ret = get_first_ref(sctx->parent_root, pm->ino, 3542 &parent_ino, &parent_gen, name); 3543 if (ret < 0) 3544 goto out; 3545 ret = get_cur_path(sctx, parent_ino, parent_gen, 3546 from_path); 3547 if (ret < 0) 3548 goto out; 3549 ret = fs_path_add_path(from_path, name); 3550 } 3551 if (ret < 0) 3552 goto out; 3553 3554 sctx->send_progress = sctx->cur_ino + 1; 3555 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); 3556 if (ret < 0) 3557 goto out; 3558 if (ret) { 3559 LIST_HEAD(deleted_refs); 3560 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); 3561 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, 3562 &pm->update_refs, &deleted_refs, 3563 is_orphan); 3564 if (ret < 0) 3565 goto out; 3566 if (rmdir_ino) { 3567 dm = get_waiting_dir_move(sctx, pm->ino); 3568 ASSERT(dm); 3569 dm->rmdir_ino = rmdir_ino; 3570 dm->rmdir_gen = rmdir_gen; 3571 } 3572 goto out; 3573 } 3574 fs_path_reset(name); 3575 to_path = name; 3576 name = NULL; 3577 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3578 if (ret < 0) 3579 goto out; 3580 3581 ret = send_rename(sctx, from_path, to_path); 3582 if (ret < 0) 3583 goto out; 3584 3585 if (rmdir_ino) { 3586 struct orphan_dir_info *odi; 3587 u64 gen; 3588 3589 odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen); 3590 if (!odi) { 3591 /* already deleted */ 3592 goto finish; 3593 } 3594 gen = odi->gen; 3595 3596 ret = can_rmdir(sctx, rmdir_ino, gen); 3597 if (ret < 0) 3598 goto out; 3599 if (!ret) 3600 goto finish; 3601 3602 name = fs_path_alloc(); 3603 if (!name) { 3604 ret = -ENOMEM; 3605 goto out; 3606 } 3607 ret = get_cur_path(sctx, rmdir_ino, gen, name); 3608 if (ret < 0) 3609 goto out; 3610 ret = send_rmdir(sctx, name); 3611 if (ret < 0) 3612 goto out; 3613 } 3614 3615 finish: 3616 ret = cache_dir_utimes(sctx, pm->ino, pm->gen); 3617 if (ret < 0) 3618 goto out; 3619 3620 /* 3621 * After rename/move, need to update the utimes of both new parent(s) 
3622 * and old parent(s). 3623 */ 3624 list_for_each_entry(cur, &pm->update_refs, list) { 3625 /* 3626 * The parent inode might have been deleted in the send snapshot 3627 */ 3628 ret = get_inode_info(sctx->send_root, cur->dir, NULL); 3629 if (ret == -ENOENT) { 3630 ret = 0; 3631 continue; 3632 } 3633 if (ret < 0) 3634 goto out; 3635 3636 ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); 3637 if (ret < 0) 3638 goto out; 3639 } 3640 3641 out: 3642 fs_path_free(name); 3643 fs_path_free(from_path); 3644 fs_path_free(to_path); 3645 sctx->send_progress = orig_progress; 3646 3647 return ret; 3648 } 3649 3650 static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m) 3651 { 3652 if (!list_empty(&m->list)) 3653 list_del(&m->list); 3654 if (!RB_EMPTY_NODE(&m->node)) 3655 rb_erase(&m->node, &sctx->pending_dir_moves); 3656 __free_recorded_refs(&m->update_refs); 3657 kfree(m); 3658 } 3659 3660 static void tail_append_pending_moves(struct send_ctx *sctx, 3661 struct pending_dir_move *moves, 3662 struct list_head *stack) 3663 { 3664 if (list_empty(&moves->list)) { 3665 list_add_tail(&moves->list, stack); 3666 } else { 3667 LIST_HEAD(list); 3668 list_splice_init(&moves->list, &list); 3669 list_add_tail(&moves->list, stack); 3670 list_splice_tail(&list, stack); 3671 } 3672 if (!RB_EMPTY_NODE(&moves->node)) { 3673 rb_erase(&moves->node, &sctx->pending_dir_moves); 3674 RB_CLEAR_NODE(&moves->node); 3675 } 3676 } 3677 3678 static int apply_children_dir_moves(struct send_ctx *sctx) 3679 { 3680 struct pending_dir_move *pm; 3681 LIST_HEAD(stack); 3682 u64 parent_ino = sctx->cur_ino; 3683 int ret = 0; 3684 3685 pm = get_pending_dir_moves(sctx, parent_ino); 3686 if (!pm) 3687 return 0; 3688 3689 tail_append_pending_moves(sctx, pm, &stack); 3690 3691 while (!list_empty(&stack)) { 3692 pm = list_first_entry(&stack, struct pending_dir_move, list); 3693 parent_ino = pm->ino; 3694 ret = apply_dir_move(sctx, pm); 3695 free_pending_move(sctx, pm); 3696 if (ret) 3697 goto out; 3698 pm = get_pending_dir_moves(sctx, parent_ino); 3699 if (pm) 3700 tail_append_pending_moves(sctx, pm, &stack); 3701 } 3702 return 0; 3703 3704 out: 3705 while (!list_empty(&stack)) { 3706 pm = list_first_entry(&stack, struct pending_dir_move, list); 3707 free_pending_move(sctx, pm); 3708 } 3709 return ret; 3710 } 3711 3712 /* 3713 * We might need to delay a directory rename even when no ancestor directory 3714 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was 3715 * renamed. This happens when we rename a directory to the old name (the name 3716 * in the parent root) of some other unrelated directory that got its rename 3717 * delayed due to some ancestor with higher number that got renamed. 3718 * 3719 * Example: 3720 * 3721 * Parent snapshot: 3722 * . (ino 256) 3723 * |---- a/ (ino 257) 3724 * | |---- file (ino 260) 3725 * | 3726 * |---- b/ (ino 258) 3727 * |---- c/ (ino 259) 3728 * 3729 * Send snapshot: 3730 * . (ino 256) 3731 * |---- a/ (ino 258) 3732 * |---- x/ (ino 259) 3733 * |---- y/ (ino 257) 3734 * |----- file (ino 260) 3735 * 3736 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 3737 * from 'a' to 'x/y' happening first, which in turn depends on the rename of 3738 * inode 259 from 'c' to 'x'. 
So the order of rename commands the send stream 3739 * must issue is: 3740 * 3741 * 1 - rename 259 from 'c' to 'x' 3742 * 2 - rename 257 from 'a' to 'x/y' 3743 * 3 - rename 258 from 'b' to 'a' 3744 * 3745 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can 3746 * be done right away and < 0 on error. 3747 */ 3748 static int wait_for_dest_dir_move(struct send_ctx *sctx, 3749 struct recorded_ref *parent_ref, 3750 const bool is_orphan) 3751 { 3752 struct btrfs_path *path; 3753 struct btrfs_key key; 3754 struct btrfs_key di_key; 3755 struct btrfs_dir_item *di; 3756 u64 left_gen; 3757 u64 right_gen; 3758 int ret = 0; 3759 struct waiting_dir_move *wdm; 3760 3761 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) 3762 return 0; 3763 3764 path = alloc_path_for_send(); 3765 if (!path) 3766 return -ENOMEM; 3767 3768 key.objectid = parent_ref->dir; 3769 key.type = BTRFS_DIR_ITEM_KEY; 3770 key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); 3771 3772 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); 3773 if (ret < 0) { 3774 goto out; 3775 } else if (ret > 0) { 3776 ret = 0; 3777 goto out; 3778 } 3779 3780 di = btrfs_match_dir_item_name(path, parent_ref->name, 3781 parent_ref->name_len); 3782 if (!di) { 3783 ret = 0; 3784 goto out; 3785 } 3786 /* 3787 * di_key.objectid has the number of the inode that has a dentry in the 3788 * parent directory with the same name that sctx->cur_ino is being 3789 * renamed to. We need to check if that inode is in the send root as 3790 * well and if it is currently marked as an inode with a pending rename, 3791 * if it is, we need to delay the rename of sctx->cur_ino as well, so 3792 * that it happens after that other inode is renamed. 3793 */ 3794 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); 3795 if (di_key.type != BTRFS_INODE_ITEM_KEY) { 3796 ret = 0; 3797 goto out; 3798 } 3799 3800 ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen); 3801 if (ret < 0) 3802 goto out; 3803 ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen); 3804 if (ret < 0) { 3805 if (ret == -ENOENT) 3806 ret = 0; 3807 goto out; 3808 } 3809 3810 /* Different inode, no need to delay the rename of sctx->cur_ino */ 3811 if (right_gen != left_gen) { 3812 ret = 0; 3813 goto out; 3814 } 3815 3816 wdm = get_waiting_dir_move(sctx, di_key.objectid); 3817 if (wdm && !wdm->orphanized) { 3818 ret = add_pending_dir_move(sctx, 3819 sctx->cur_ino, 3820 sctx->cur_inode_gen, 3821 di_key.objectid, 3822 &sctx->new_refs, 3823 &sctx->deleted_refs, 3824 is_orphan); 3825 if (!ret) 3826 ret = 1; 3827 } 3828 out: 3829 btrfs_free_path(path); 3830 return ret; 3831 } 3832 3833 /* 3834 * Check if inode ino2, or any of its ancestors, is inode ino1. 3835 * Return 1 if true, 0 if false and < 0 on error. 
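 * The check walks the first ref of each inode from ino2 up to the subvolume
 * root, comparing both inode number and generation at each step.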
3836 */ 3837 static int check_ino_in_path(struct btrfs_root *root, 3838 const u64 ino1, 3839 const u64 ino1_gen, 3840 const u64 ino2, 3841 const u64 ino2_gen, 3842 struct fs_path *fs_path) 3843 { 3844 u64 ino = ino2; 3845 3846 if (ino1 == ino2) 3847 return ino1_gen == ino2_gen; 3848 3849 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3850 u64 parent; 3851 u64 parent_gen; 3852 int ret; 3853 3854 fs_path_reset(fs_path); 3855 ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); 3856 if (ret < 0) 3857 return ret; 3858 if (parent == ino1) 3859 return parent_gen == ino1_gen; 3860 ino = parent; 3861 } 3862 return 0; 3863 } 3864 3865 /* 3866 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any 3867 * possible path (in case ino2 is not a directory and has multiple hard links). 3868 * Return 1 if true, 0 if false and < 0 on error. 3869 */ 3870 static int is_ancestor(struct btrfs_root *root, 3871 const u64 ino1, 3872 const u64 ino1_gen, 3873 const u64 ino2, 3874 struct fs_path *fs_path) 3875 { 3876 bool free_fs_path = false; 3877 int ret = 0; 3878 int iter_ret = 0; 3879 struct btrfs_path *path = NULL; 3880 struct btrfs_key key; 3881 3882 if (!fs_path) { 3883 fs_path = fs_path_alloc(); 3884 if (!fs_path) 3885 return -ENOMEM; 3886 free_fs_path = true; 3887 } 3888 3889 path = alloc_path_for_send(); 3890 if (!path) { 3891 ret = -ENOMEM; 3892 goto out; 3893 } 3894 3895 key.objectid = ino2; 3896 key.type = BTRFS_INODE_REF_KEY; 3897 key.offset = 0; 3898 3899 btrfs_for_each_slot(root, &key, &key, path, iter_ret) { 3900 struct extent_buffer *leaf = path->nodes[0]; 3901 int slot = path->slots[0]; 3902 u32 cur_offset = 0; 3903 u32 item_size; 3904 3905 if (key.objectid != ino2) 3906 break; 3907 if (key.type != BTRFS_INODE_REF_KEY && 3908 key.type != BTRFS_INODE_EXTREF_KEY) 3909 break; 3910 3911 item_size = btrfs_item_size(leaf, slot); 3912 while (cur_offset < item_size) { 3913 u64 parent; 3914 u64 parent_gen; 3915 3916 if (key.type == BTRFS_INODE_EXTREF_KEY) { 3917 unsigned long ptr; 3918 struct btrfs_inode_extref *extref; 3919 3920 ptr = btrfs_item_ptr_offset(leaf, slot); 3921 extref = (struct btrfs_inode_extref *) 3922 (ptr + cur_offset); 3923 parent = btrfs_inode_extref_parent(leaf, 3924 extref); 3925 cur_offset += sizeof(*extref); 3926 cur_offset += btrfs_inode_extref_name_len(leaf, 3927 extref); 3928 } else { 3929 parent = key.offset; 3930 cur_offset = item_size; 3931 } 3932 3933 ret = get_inode_gen(root, parent, &parent_gen); 3934 if (ret < 0) 3935 goto out; 3936 ret = check_ino_in_path(root, ino1, ino1_gen, 3937 parent, parent_gen, fs_path); 3938 if (ret) 3939 goto out; 3940 } 3941 } 3942 ret = 0; 3943 if (iter_ret < 0) 3944 ret = iter_ret; 3945 3946 out: 3947 btrfs_free_path(path); 3948 if (free_fs_path) 3949 fs_path_free(fs_path); 3950 return ret; 3951 } 3952 3953 static int wait_for_parent_move(struct send_ctx *sctx, 3954 struct recorded_ref *parent_ref, 3955 const bool is_orphan) 3956 { 3957 int ret = 0; 3958 u64 ino = parent_ref->dir; 3959 u64 ino_gen = parent_ref->dir_gen; 3960 u64 parent_ino_before, parent_ino_after; 3961 struct fs_path *path_before = NULL; 3962 struct fs_path *path_after = NULL; 3963 int len1, len2; 3964 3965 path_after = fs_path_alloc(); 3966 path_before = fs_path_alloc(); 3967 if (!path_after || !path_before) { 3968 ret = -ENOMEM; 3969 goto out; 3970 } 3971 3972 /* 3973 * Our current directory inode may not yet be renamed/moved because some 3974 * ancestor (immediate or not) has to be renamed/moved first. 
So find if 3975 * such ancestor exists and make sure our own rename/move happens after 3976 * that ancestor is processed to avoid path build infinite loops (done 3977 * at get_cur_path()). 3978 */ 3979 while (ino > BTRFS_FIRST_FREE_OBJECTID) { 3980 u64 parent_ino_after_gen; 3981 3982 if (is_waiting_for_move(sctx, ino)) { 3983 /* 3984 * If the current inode is an ancestor of ino in the 3985 * parent root, we need to delay the rename of the 3986 * current inode, otherwise don't delayed the rename 3987 * because we can end up with a circular dependency 3988 * of renames, resulting in some directories never 3989 * getting the respective rename operations issued in 3990 * the send stream or getting into infinite path build 3991 * loops. 3992 */ 3993 ret = is_ancestor(sctx->parent_root, 3994 sctx->cur_ino, sctx->cur_inode_gen, 3995 ino, path_before); 3996 if (ret) 3997 break; 3998 } 3999 4000 fs_path_reset(path_before); 4001 fs_path_reset(path_after); 4002 4003 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 4004 &parent_ino_after_gen, path_after); 4005 if (ret < 0) 4006 goto out; 4007 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, 4008 NULL, path_before); 4009 if (ret < 0 && ret != -ENOENT) { 4010 goto out; 4011 } else if (ret == -ENOENT) { 4012 ret = 0; 4013 break; 4014 } 4015 4016 len1 = fs_path_len(path_before); 4017 len2 = fs_path_len(path_after); 4018 if (ino > sctx->cur_ino && 4019 (parent_ino_before != parent_ino_after || len1 != len2 || 4020 memcmp(path_before->start, path_after->start, len1))) { 4021 u64 parent_ino_gen; 4022 4023 ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen); 4024 if (ret < 0) 4025 goto out; 4026 if (ino_gen == parent_ino_gen) { 4027 ret = 1; 4028 break; 4029 } 4030 } 4031 ino = parent_ino_after; 4032 ino_gen = parent_ino_after_gen; 4033 } 4034 4035 out: 4036 fs_path_free(path_before); 4037 fs_path_free(path_after); 4038 4039 if (ret == 1) { 4040 ret = add_pending_dir_move(sctx, 4041 sctx->cur_ino, 4042 sctx->cur_inode_gen, 4043 ino, 4044 &sctx->new_refs, 4045 &sctx->deleted_refs, 4046 is_orphan); 4047 if (!ret) 4048 ret = 1; 4049 } 4050 4051 return ret; 4052 } 4053 4054 static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) 4055 { 4056 int ret; 4057 struct fs_path *new_path; 4058 4059 /* 4060 * Our reference's name member points to its full_path member string, so 4061 * we use here a new path. 4062 */ 4063 new_path = fs_path_alloc(); 4064 if (!new_path) 4065 return -ENOMEM; 4066 4067 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path); 4068 if (ret < 0) { 4069 fs_path_free(new_path); 4070 return ret; 4071 } 4072 ret = fs_path_add(new_path, ref->name, ref->name_len); 4073 if (ret < 0) { 4074 fs_path_free(new_path); 4075 return ret; 4076 } 4077 4078 fs_path_free(ref->full_path); 4079 set_ref_path(ref, new_path); 4080 4081 return 0; 4082 } 4083 4084 /* 4085 * When processing the new references for an inode we may orphanize an existing 4086 * directory inode because its old name conflicts with one of the new references 4087 * of the current inode. Later, when processing another new reference of our 4088 * inode, we might need to orphanize another inode, but the path we have in the 4089 * reference reflects the pre-orphanization name of the directory we previously 4090 * orphanized. For example: 4091 * 4092 * parent snapshot looks like: 4093 * 4094 * . 
(ino 256) 4095 * |----- f1 (ino 257) 4096 * |----- f2 (ino 258) 4097 * |----- d1/ (ino 259) 4098 * |----- d2/ (ino 260) 4099 * 4100 * send snapshot looks like: 4101 * 4102 * . (ino 256) 4103 * |----- d1 (ino 258) 4104 * |----- f2/ (ino 259) 4105 * |----- f2_link/ (ino 260) 4106 * | |----- f1 (ino 257) 4107 * | 4108 * |----- d2 (ino 258) 4109 * 4110 * When processing inode 257 we compute the name for inode 259 as "d1", and we 4111 * cache it in the name cache. Later when we start processing inode 258, when 4112 * collecting all its new references we set a full path of "d1/d2" for its new 4113 * reference with name "d2". When we start processing the new references we 4114 * start by processing the new reference with name "d1", and this results in 4115 * orphanizing inode 259, since its old reference causes a conflict. Then we 4116 * move on the next new reference, with name "d2", and we find out we must 4117 * orphanize inode 260, as its old reference conflicts with ours - but for the 4118 * orphanization we use a source path corresponding to the path we stored in the 4119 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the 4120 * receiver fail since the path component "d1/" no longer exists, it was renamed 4121 * to "o259-6-0/" when processing the previous new reference. So in this case we 4122 * must recompute the path in the new reference and use it for the new 4123 * orphanization operation. 4124 */ 4125 static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) 4126 { 4127 char *name; 4128 int ret; 4129 4130 name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); 4131 if (!name) 4132 return -ENOMEM; 4133 4134 fs_path_reset(ref->full_path); 4135 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); 4136 if (ret < 0) 4137 goto out; 4138 4139 ret = fs_path_add(ref->full_path, name, ref->name_len); 4140 if (ret < 0) 4141 goto out; 4142 4143 /* Update the reference's base name pointer. */ 4144 set_ref_path(ref, ref->full_path); 4145 out: 4146 kfree(name); 4147 return ret; 4148 } 4149 4150 static int rename_current_inode(struct send_ctx *sctx, 4151 struct fs_path *current_path, 4152 struct fs_path *new_path) 4153 { 4154 int ret; 4155 4156 ret = send_rename(sctx, current_path, new_path); 4157 if (ret < 0) 4158 return ret; 4159 4160 ret = fs_path_copy(&sctx->cur_inode_path, new_path); 4161 if (ret < 0) 4162 return ret; 4163 4164 return fs_path_copy(current_path, new_path); 4165 } 4166 4167 /* 4168 * This does all the move/link/unlink/rmdir magic. 4169 */ 4170 static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) 4171 { 4172 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 4173 int ret = 0; 4174 struct recorded_ref *cur; 4175 struct recorded_ref *cur2; 4176 LIST_HEAD(check_dirs); 4177 struct fs_path *valid_path = NULL; 4178 u64 ow_inode = 0; 4179 u64 ow_gen; 4180 u64 ow_mode; 4181 u64 last_dir_ino_rm = 0; 4182 bool did_overwrite = false; 4183 bool is_orphan = false; 4184 bool can_rename = true; 4185 bool orphanized_dir = false; 4186 bool orphanized_ancestor = false; 4187 4188 /* 4189 * This should never happen as the root dir always has the same ref 4190 * which is always '..' 
4191 */
4192 if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
4193 btrfs_err(fs_info,
4194 "send: unexpected inode %llu in process_recorded_refs()",
4195 sctx->cur_ino);
4196 ret = -EINVAL;
4197 goto out;
4198 }
4199
4200 valid_path = fs_path_alloc();
4201 if (!valid_path) {
4202 ret = -ENOMEM;
4203 goto out;
4204 }
4205
4206 /*
4207 * First, check if the first ref of the current inode was overwritten
4208 * before. If yes, we know that the current inode was already orphanized
4209 * and thus use the orphan name. If not, we can use get_cur_path to
4210 * get the path of the first ref as it would look like while receiving at
4211 * this point in time.
4212 * New inodes are always orphans at the beginning, so force the use of the
4213 * orphan name in this case.
4214 * The first ref is stored in valid_path and will be updated if it
4215 * gets moved around.
4216 */
4217 if (!sctx->cur_inode_new) {
4218 ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
4219 sctx->cur_inode_gen);
4220 if (ret < 0)
4221 goto out;
4222 if (ret)
4223 did_overwrite = true;
4224 }
4225 if (sctx->cur_inode_new || did_overwrite) {
4226 ret = gen_unique_name(sctx, sctx->cur_ino,
4227 sctx->cur_inode_gen, valid_path);
4228 if (ret < 0)
4229 goto out;
4230 is_orphan = true;
4231 } else {
4232 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4233 valid_path);
4234 if (ret < 0)
4235 goto out;
4236 }
4237
4238 /*
4239 * Before doing any rename and link operations, do a first pass on the
4240 * new references to orphanize any unprocessed inodes that may have a
4241 * reference that conflicts with one of the new references of the current
4242 * inode. This needs to happen first because a new reference may conflict
4243 * with the old reference of a parent directory, so we must make sure
4244 * that the path used for link and rename commands doesn't use an
4245 * orphanized name when an ancestor was not yet orphanized.
4246 *
4247 * Example:
4248 *
4249 * Parent snapshot:
4250 *
4251 * . (ino 256)
4252 * |----- testdir/ (ino 259)
4253 * | |----- a (ino 257)
4254 * |
4255 * |----- b (ino 258)
4256 *
4257 * Send snapshot:
4258 *
4259 * . (ino 256)
4260 * |----- testdir_2/ (ino 259)
4261 * | |----- a (ino 260)
4262 * |
4263 * |----- testdir (ino 257)
4264 * |----- b (ino 257)
4265 * |----- b2 (ino 258)
4266 *
4267 * Processing the new reference for inode 257 with name "b" may happen
4268 * before processing the new reference with name "testdir". If so, we
4269 * must make sure that by the time we send a link command to create the
4270 * hard link "b", inode 259 was already orphanized, since the generated
4271 * path in "valid_path" already contains the orphanized name for 259.
4272 * We are processing inode 257, so it is only later, when processing 259,
4273 * that we do the rename operation to change its temporary (orphanized)
4274 * name to "testdir_2".
4275 */
4276 list_for_each_entry(cur, &sctx->new_refs, list) {
4277 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
4278 if (ret < 0)
4279 goto out;
4280 if (ret == inode_state_will_create)
4281 continue;
4282
4283 /*
4284 * Check if this new ref would overwrite the first ref of another
4285 * unprocessed inode. If yes, orphanize the overwritten inode.
4286 * If we find an overwritten ref that is not the first ref,
4287 * simply unlink it.
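 * (A conflicting ref that is not the first ref is just an extra hard
 * link name; path building for the overwritten inode always uses its
 * first ref, so removing the conflicting name is enough and no
 * orphanization is needed.)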
4288 */ 4289 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, 4290 cur->name, cur->name_len, 4291 &ow_inode, &ow_gen, &ow_mode); 4292 if (ret < 0) 4293 goto out; 4294 if (ret) { 4295 ret = is_first_ref(sctx->parent_root, 4296 ow_inode, cur->dir, cur->name, 4297 cur->name_len); 4298 if (ret < 0) 4299 goto out; 4300 if (ret) { 4301 struct name_cache_entry *nce; 4302 struct waiting_dir_move *wdm; 4303 4304 if (orphanized_dir) { 4305 ret = refresh_ref_path(sctx, cur); 4306 if (ret < 0) 4307 goto out; 4308 } 4309 4310 ret = orphanize_inode(sctx, ow_inode, ow_gen, 4311 cur->full_path); 4312 if (ret < 0) 4313 goto out; 4314 if (S_ISDIR(ow_mode)) 4315 orphanized_dir = true; 4316 4317 /* 4318 * If ow_inode has its rename operation delayed 4319 * make sure that its orphanized name is used in 4320 * the source path when performing its rename 4321 * operation. 4322 */ 4323 wdm = get_waiting_dir_move(sctx, ow_inode); 4324 if (wdm) 4325 wdm->orphanized = true; 4326 4327 /* 4328 * Make sure we clear our orphanized inode's 4329 * name from the name cache. This is because the 4330 * inode ow_inode might be an ancestor of some 4331 * other inode that will be orphanized as well 4332 * later and has an inode number greater than 4333 * sctx->send_progress. We need to prevent 4334 * future name lookups from using the old name 4335 * and get instead the orphan name. 4336 */ 4337 nce = name_cache_search(sctx, ow_inode, ow_gen); 4338 if (nce) 4339 btrfs_lru_cache_remove(&sctx->name_cache, 4340 &nce->entry); 4341 4342 /* 4343 * ow_inode might currently be an ancestor of 4344 * cur_ino, therefore compute valid_path (the 4345 * current path of cur_ino) again because it 4346 * might contain the pre-orphanization name of 4347 * ow_inode, which is no longer valid. 4348 */ 4349 ret = is_ancestor(sctx->parent_root, 4350 ow_inode, ow_gen, 4351 sctx->cur_ino, NULL); 4352 if (ret > 0) { 4353 orphanized_ancestor = true; 4354 fs_path_reset(valid_path); 4355 fs_path_reset(&sctx->cur_inode_path); 4356 ret = get_cur_path(sctx, sctx->cur_ino, 4357 sctx->cur_inode_gen, 4358 valid_path); 4359 } 4360 if (ret < 0) 4361 goto out; 4362 } else { 4363 /* 4364 * If we previously orphanized a directory that 4365 * collided with a new reference that we already 4366 * processed, recompute the current path because 4367 * that directory may be part of the path. 4368 */ 4369 if (orphanized_dir) { 4370 ret = refresh_ref_path(sctx, cur); 4371 if (ret < 0) 4372 goto out; 4373 } 4374 ret = send_unlink(sctx, cur->full_path); 4375 if (ret < 0) 4376 goto out; 4377 } 4378 } 4379 4380 } 4381 4382 list_for_each_entry(cur, &sctx->new_refs, list) { 4383 /* 4384 * We may have refs where the parent directory does not exist 4385 * yet. This happens if the parent directories inum is higher 4386 * than the current inum. To handle this case, we create the 4387 * parent directory out of order. But we need to check if this 4388 * did already happen before due to other refs in the same dir. 4389 */ 4390 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); 4391 if (ret < 0) 4392 goto out; 4393 if (ret == inode_state_will_create) { 4394 ret = 0; 4395 /* 4396 * First check if any of the current inodes refs did 4397 * already create the dir. 4398 */ 4399 list_for_each_entry(cur2, &sctx->new_refs, list) { 4400 if (cur == cur2) 4401 break; 4402 if (cur2->dir == cur->dir) { 4403 ret = 1; 4404 break; 4405 } 4406 } 4407 4408 /* 4409 * If that did not happen, check if a previous inode 4410 * did already create the dir. 
4411 */ 4412 if (!ret) 4413 ret = did_create_dir(sctx, cur->dir); 4414 if (ret < 0) 4415 goto out; 4416 if (!ret) { 4417 ret = send_create_inode(sctx, cur->dir); 4418 if (ret < 0) 4419 goto out; 4420 cache_dir_created(sctx, cur->dir); 4421 } 4422 } 4423 4424 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { 4425 ret = wait_for_dest_dir_move(sctx, cur, is_orphan); 4426 if (ret < 0) 4427 goto out; 4428 if (ret == 1) { 4429 can_rename = false; 4430 *pending_move = 1; 4431 } 4432 } 4433 4434 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && 4435 can_rename) { 4436 ret = wait_for_parent_move(sctx, cur, is_orphan); 4437 if (ret < 0) 4438 goto out; 4439 if (ret == 1) { 4440 can_rename = false; 4441 *pending_move = 1; 4442 } 4443 } 4444 4445 /* 4446 * link/move the ref to the new place. If we have an orphan 4447 * inode, move it and update valid_path. If not, link or move 4448 * it depending on the inode mode. 4449 */ 4450 if (is_orphan && can_rename) { 4451 ret = rename_current_inode(sctx, valid_path, cur->full_path); 4452 if (ret < 0) 4453 goto out; 4454 is_orphan = false; 4455 } else if (can_rename) { 4456 if (S_ISDIR(sctx->cur_inode_mode)) { 4457 /* 4458 * Dirs can't be linked, so move it. For moved 4459 * dirs, we always have one new and one deleted 4460 * ref. The deleted ref is ignored later. 4461 */ 4462 ret = rename_current_inode(sctx, valid_path, 4463 cur->full_path); 4464 if (ret < 0) 4465 goto out; 4466 } else { 4467 /* 4468 * We might have previously orphanized an inode 4469 * which is an ancestor of our current inode, 4470 * so our reference's full path, which was 4471 * computed before any such orphanizations, must 4472 * be updated. 4473 */ 4474 if (orphanized_dir) { 4475 ret = update_ref_path(sctx, cur); 4476 if (ret < 0) 4477 goto out; 4478 } 4479 ret = send_link(sctx, cur->full_path, 4480 valid_path); 4481 if (ret < 0) 4482 goto out; 4483 } 4484 } 4485 ret = dup_ref(cur, &check_dirs); 4486 if (ret < 0) 4487 goto out; 4488 } 4489 4490 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) { 4491 /* 4492 * Check if we can already rmdir the directory. If not, 4493 * orphanize it. For every dir item inside that gets deleted 4494 * later, we do this check again and rmdir it then if possible. 4495 * See the use of check_dirs for more details. 4496 */ 4497 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen); 4498 if (ret < 0) 4499 goto out; 4500 if (ret) { 4501 ret = send_rmdir(sctx, valid_path); 4502 if (ret < 0) 4503 goto out; 4504 } else if (!is_orphan) { 4505 ret = orphanize_inode(sctx, sctx->cur_ino, 4506 sctx->cur_inode_gen, valid_path); 4507 if (ret < 0) 4508 goto out; 4509 is_orphan = true; 4510 } 4511 4512 list_for_each_entry(cur, &sctx->deleted_refs, list) { 4513 ret = dup_ref(cur, &check_dirs); 4514 if (ret < 0) 4515 goto out; 4516 } 4517 } else if (S_ISDIR(sctx->cur_inode_mode) && 4518 !list_empty(&sctx->deleted_refs)) { 4519 /* 4520 * We have a moved dir. Add the old parent to check_dirs 4521 */ 4522 cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); 4523 ret = dup_ref(cur, &check_dirs); 4524 if (ret < 0) 4525 goto out; 4526 } else if (!S_ISDIR(sctx->cur_inode_mode)) { 4527 /* 4528 * We have a non dir inode. Go through all deleted refs and 4529 * unlink them if they were not already overwritten by other 4530 * inodes. 
4531 */ 4532 list_for_each_entry(cur, &sctx->deleted_refs, list) { 4533 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen, 4534 sctx->cur_ino, sctx->cur_inode_gen, 4535 cur->name, cur->name_len); 4536 if (ret < 0) 4537 goto out; 4538 if (!ret) { 4539 /* 4540 * If we orphanized any ancestor before, we need 4541 * to recompute the full path for deleted names, 4542 * since any such path was computed before we 4543 * processed any references and orphanized any 4544 * ancestor inode. 4545 */ 4546 if (orphanized_ancestor) { 4547 ret = update_ref_path(sctx, cur); 4548 if (ret < 0) 4549 goto out; 4550 } 4551 ret = send_unlink(sctx, cur->full_path); 4552 if (ret < 0) 4553 goto out; 4554 if (is_current_inode_path(sctx, cur->full_path)) 4555 fs_path_reset(&sctx->cur_inode_path); 4556 } 4557 ret = dup_ref(cur, &check_dirs); 4558 if (ret < 0) 4559 goto out; 4560 } 4561 /* 4562 * If the inode is still orphan, unlink the orphan. This may 4563 * happen when a previous inode did overwrite the first ref 4564 * of this inode and no new refs were added for the current 4565 * inode. Unlinking does not mean that the inode is deleted in 4566 * all cases. There may still be links to this inode in other 4567 * places. 4568 */ 4569 if (is_orphan) { 4570 ret = send_unlink(sctx, valid_path); 4571 if (ret < 0) 4572 goto out; 4573 } 4574 } 4575 4576 /* 4577 * We did collect all parent dirs where cur_inode was once located. We 4578 * now go through all these dirs and check if they are pending for 4579 * deletion and if it's finally possible to perform the rmdir now. 4580 * We also update the inode stats of the parent dirs here. 4581 */ 4582 list_for_each_entry(cur, &check_dirs, list) { 4583 /* 4584 * In case we had refs into dirs that were not processed yet, 4585 * we don't need to do the utime and rmdir logic for these dirs. 4586 * The dir will be processed later. 
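 * (Those are the dirs whose inode number is higher than sctx->cur_ino,
 * which the check below skips.)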
4587 */ 4588 if (cur->dir > sctx->cur_ino) 4589 continue; 4590 4591 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL); 4592 if (ret < 0) 4593 goto out; 4594 4595 if (ret == inode_state_did_create || 4596 ret == inode_state_no_change) { 4597 ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen); 4598 if (ret < 0) 4599 goto out; 4600 } else if (ret == inode_state_did_delete && 4601 cur->dir != last_dir_ino_rm) { 4602 ret = can_rmdir(sctx, cur->dir, cur->dir_gen); 4603 if (ret < 0) 4604 goto out; 4605 if (ret) { 4606 ret = get_cur_path(sctx, cur->dir, 4607 cur->dir_gen, valid_path); 4608 if (ret < 0) 4609 goto out; 4610 ret = send_rmdir(sctx, valid_path); 4611 if (ret < 0) 4612 goto out; 4613 last_dir_ino_rm = cur->dir; 4614 } 4615 } 4616 } 4617 4618 ret = 0; 4619 4620 out: 4621 __free_recorded_refs(&check_dirs); 4622 free_recorded_refs(sctx); 4623 fs_path_free(valid_path); 4624 return ret; 4625 } 4626 4627 static int rbtree_ref_comp(const void *k, const struct rb_node *node) 4628 { 4629 const struct recorded_ref *data = k; 4630 const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); 4631 int result; 4632 4633 if (data->dir > ref->dir) 4634 return 1; 4635 if (data->dir < ref->dir) 4636 return -1; 4637 if (data->dir_gen > ref->dir_gen) 4638 return 1; 4639 if (data->dir_gen < ref->dir_gen) 4640 return -1; 4641 if (data->name_len > ref->name_len) 4642 return 1; 4643 if (data->name_len < ref->name_len) 4644 return -1; 4645 result = strcmp(data->name, ref->name); 4646 if (result > 0) 4647 return 1; 4648 if (result < 0) 4649 return -1; 4650 return 0; 4651 } 4652 4653 static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) 4654 { 4655 const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); 4656 4657 return rbtree_ref_comp(entry, parent) < 0; 4658 } 4659 4660 static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, 4661 struct fs_path *name, u64 dir, u64 dir_gen, 4662 struct send_ctx *sctx) 4663 { 4664 int ret = 0; 4665 struct fs_path *path = NULL; 4666 struct recorded_ref *ref = NULL; 4667 4668 path = fs_path_alloc(); 4669 if (!path) { 4670 ret = -ENOMEM; 4671 goto out; 4672 } 4673 4674 ref = recorded_ref_alloc(); 4675 if (!ref) { 4676 ret = -ENOMEM; 4677 goto out; 4678 } 4679 4680 ret = get_cur_path(sctx, dir, dir_gen, path); 4681 if (ret < 0) 4682 goto out; 4683 ret = fs_path_add_path(path, name); 4684 if (ret < 0) 4685 goto out; 4686 4687 ref->dir = dir; 4688 ref->dir_gen = dir_gen; 4689 set_ref_path(ref, path); 4690 list_add_tail(&ref->list, refs); 4691 rb_add(&ref->node, root, rbtree_ref_less); 4692 ref->root = root; 4693 out: 4694 if (ret) { 4695 if (path && (!ref || !ref->full_path)) 4696 fs_path_free(path); 4697 recorded_ref_free(ref); 4698 } 4699 return ret; 4700 } 4701 4702 static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) 4703 { 4704 int ret; 4705 struct send_ctx *sctx = ctx; 4706 struct rb_node *node = NULL; 4707 struct recorded_ref data; 4708 struct recorded_ref *ref; 4709 u64 dir_gen; 4710 4711 ret = get_inode_gen(sctx->send_root, dir, &dir_gen); 4712 if (ret < 0) 4713 return ret; 4714 4715 data.dir = dir; 4716 data.dir_gen = dir_gen; 4717 set_ref_path(&data, name); 4718 node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); 4719 if (node) { 4720 ref = rb_entry(node, struct recorded_ref, node); 4721 recorded_ref_free(ref); 4722 } else { 4723 ret = record_ref_in_tree(&sctx->rbtree_new_refs, 4724 &sctx->new_refs, name, dir, dir_gen, 4725 sctx); 
4726 } 4727 4728 return ret; 4729 } 4730 4731 static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) 4732 { 4733 int ret; 4734 struct send_ctx *sctx = ctx; 4735 struct rb_node *node = NULL; 4736 struct recorded_ref data; 4737 struct recorded_ref *ref; 4738 u64 dir_gen; 4739 4740 ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); 4741 if (ret < 0) 4742 return ret; 4743 4744 data.dir = dir; 4745 data.dir_gen = dir_gen; 4746 set_ref_path(&data, name); 4747 node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); 4748 if (node) { 4749 ref = rb_entry(node, struct recorded_ref, node); 4750 recorded_ref_free(ref); 4751 } else { 4752 ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, 4753 &sctx->deleted_refs, name, dir, 4754 dir_gen, sctx); 4755 } 4756 4757 return ret; 4758 } 4759 4760 static int record_new_ref(struct send_ctx *sctx) 4761 { 4762 int ret; 4763 4764 ret = iterate_inode_ref(sctx->send_root, sctx->left_path, 4765 sctx->cmp_key, 0, record_new_ref_if_needed, sctx); 4766 if (ret < 0) 4767 return ret; 4768 4769 return 0; 4770 } 4771 4772 static int record_deleted_ref(struct send_ctx *sctx) 4773 { 4774 int ret; 4775 4776 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, 4777 sctx->cmp_key, 0, record_deleted_ref_if_needed, 4778 sctx); 4779 if (ret < 0) 4780 return ret; 4781 4782 return 0; 4783 } 4784 4785 static int record_changed_ref(struct send_ctx *sctx) 4786 { 4787 int ret; 4788 4789 ret = iterate_inode_ref(sctx->send_root, sctx->left_path, 4790 sctx->cmp_key, 0, record_new_ref_if_needed, sctx); 4791 if (ret < 0) 4792 return ret; 4793 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, 4794 sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); 4795 if (ret < 0) 4796 return ret; 4797 4798 return 0; 4799 } 4800 4801 /* 4802 * Record and process all refs at once. Needed when an inode changes the 4803 * generation number, which means that it was deleted and recreated. 4804 */ 4805 static int process_all_refs(struct send_ctx *sctx, 4806 enum btrfs_compare_tree_result cmd) 4807 { 4808 int ret = 0; 4809 int iter_ret = 0; 4810 struct btrfs_root *root; 4811 struct btrfs_path *path; 4812 struct btrfs_key key; 4813 struct btrfs_key found_key; 4814 iterate_inode_ref_t cb; 4815 int pending_move = 0; 4816 4817 path = alloc_path_for_send(); 4818 if (!path) 4819 return -ENOMEM; 4820 4821 if (cmd == BTRFS_COMPARE_TREE_NEW) { 4822 root = sctx->send_root; 4823 cb = record_new_ref_if_needed; 4824 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { 4825 root = sctx->parent_root; 4826 cb = record_deleted_ref_if_needed; 4827 } else { 4828 btrfs_err(sctx->send_root->fs_info, 4829 "Wrong command %d in process_all_refs", cmd); 4830 ret = -EINVAL; 4831 goto out; 4832 } 4833 4834 key.objectid = sctx->cmp_key->objectid; 4835 key.type = BTRFS_INODE_REF_KEY; 4836 key.offset = 0; 4837 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 4838 if (found_key.objectid != key.objectid || 4839 (found_key.type != BTRFS_INODE_REF_KEY && 4840 found_key.type != BTRFS_INODE_EXTREF_KEY)) 4841 break; 4842 4843 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 4844 if (ret < 0) 4845 goto out; 4846 } 4847 /* Catch error found during iteration */ 4848 if (iter_ret < 0) { 4849 ret = iter_ret; 4850 goto out; 4851 } 4852 btrfs_release_path(path); 4853 4854 /* 4855 * We don't actually care about pending_move as we are simply 4856 * re-creating this inode and will be rename'ing it into place once we 4857 * rename the parent directory. 
4858 */
4859 ret = process_recorded_refs(sctx, &pending_move);
4860 out:
4861 btrfs_free_path(path);
4862 return ret;
4863 }
4864
4865 static int send_set_xattr(struct send_ctx *sctx,
4866 const char *name, int name_len,
4867 const char *data, int data_len)
4868 {
4869 struct fs_path *path;
4870 int ret;
4871
4872 path = get_cur_inode_path(sctx);
4873 if (IS_ERR(path))
4874 return PTR_ERR(path);
4875
4876 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
4877 if (ret < 0)
4878 return ret;
4879
4880 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4881 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4882 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4883
4884 ret = send_cmd(sctx);
4885
4886 tlv_put_failure:
4887 return ret;
4888 }
4889
4890 static int send_remove_xattr(struct send_ctx *sctx,
4891 struct fs_path *path,
4892 const char *name, int name_len)
4893 {
4894 int ret;
4895
4896 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
4897 if (ret < 0)
4898 return ret;
4899
4900 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4901 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4902
4903 ret = send_cmd(sctx);
4904
4905 tlv_put_failure:
4906 return ret;
4907 }
4908
4909 static int __process_new_xattr(int num, struct btrfs_key *di_key,
4910 const char *name, int name_len, const char *data,
4911 int data_len, void *ctx)
4912 {
4913 struct send_ctx *sctx = ctx;
4914 struct posix_acl_xattr_header dummy_acl;
4915
4916 /* Capabilities are emitted by finish_inode_if_needed */
4917 if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4918 return 0;
4919
4920 /*
4921 * This hack is needed because empty acls are stored as zero byte
4922 * data in xattrs. The problem with that is that receiving these zero byte
4923 * acls will fail later. To fix this, we send a dummy acl list that
4924 * only contains the version number and no entries.
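 * (The dummy payload built below is a bare struct posix_acl_xattr_header
 * with a_version set to POSIX_ACL_XATTR_VERSION, i.e. 4 bytes and zero
 * ACL entries.)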
4925 */ 4926 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || 4927 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) { 4928 if (data_len == 0) { 4929 dummy_acl.a_version = 4930 cpu_to_le32(POSIX_ACL_XATTR_VERSION); 4931 data = (char *)&dummy_acl; 4932 data_len = sizeof(dummy_acl); 4933 } 4934 } 4935 4936 return send_set_xattr(sctx, name, name_len, data, data_len); 4937 } 4938 4939 static int __process_deleted_xattr(int num, struct btrfs_key *di_key, 4940 const char *name, int name_len, 4941 const char *data, int data_len, void *ctx) 4942 { 4943 struct send_ctx *sctx = ctx; 4944 struct fs_path *p; 4945 4946 p = get_cur_inode_path(sctx); 4947 if (IS_ERR(p)) 4948 return PTR_ERR(p); 4949 4950 return send_remove_xattr(sctx, p, name, name_len); 4951 } 4952 4953 static int process_new_xattr(struct send_ctx *sctx) 4954 { 4955 return iterate_dir_item(sctx->send_root, sctx->left_path, 4956 __process_new_xattr, sctx); 4957 } 4958 4959 static int process_deleted_xattr(struct send_ctx *sctx) 4960 { 4961 return iterate_dir_item(sctx->parent_root, sctx->right_path, 4962 __process_deleted_xattr, sctx); 4963 } 4964 4965 struct find_xattr_ctx { 4966 const char *name; 4967 int name_len; 4968 int found_idx; 4969 char *found_data; 4970 int found_data_len; 4971 }; 4972 4973 static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, 4974 int name_len, const char *data, int data_len, void *vctx) 4975 { 4976 struct find_xattr_ctx *ctx = vctx; 4977 4978 if (name_len == ctx->name_len && 4979 strncmp(name, ctx->name, name_len) == 0) { 4980 ctx->found_idx = num; 4981 ctx->found_data_len = data_len; 4982 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); 4983 if (!ctx->found_data) 4984 return -ENOMEM; 4985 return 1; 4986 } 4987 return 0; 4988 } 4989 4990 static int find_xattr(struct btrfs_root *root, 4991 struct btrfs_path *path, 4992 struct btrfs_key *key, 4993 const char *name, int name_len, 4994 char **data, int *data_len) 4995 { 4996 int ret; 4997 struct find_xattr_ctx ctx; 4998 4999 ctx.name = name; 5000 ctx.name_len = name_len; 5001 ctx.found_idx = -1; 5002 ctx.found_data = NULL; 5003 ctx.found_data_len = 0; 5004 5005 ret = iterate_dir_item(root, path, __find_xattr, &ctx); 5006 if (ret < 0) 5007 return ret; 5008 5009 if (ctx.found_idx == -1) 5010 return -ENOENT; 5011 if (data) { 5012 *data = ctx.found_data; 5013 *data_len = ctx.found_data_len; 5014 } else { 5015 kfree(ctx.found_data); 5016 } 5017 return ctx.found_idx; 5018 } 5019 5020 5021 static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, 5022 const char *name, int name_len, 5023 const char *data, int data_len, 5024 void *ctx) 5025 { 5026 int ret; 5027 struct send_ctx *sctx = ctx; 5028 char *found_data = NULL; 5029 int found_data_len = 0; 5030 5031 ret = find_xattr(sctx->parent_root, sctx->right_path, 5032 sctx->cmp_key, name, name_len, &found_data, 5033 &found_data_len); 5034 if (ret == -ENOENT) { 5035 ret = __process_new_xattr(num, di_key, name, name_len, data, 5036 data_len, ctx); 5037 } else if (ret >= 0) { 5038 if (data_len != found_data_len || 5039 memcmp(data, found_data, data_len)) { 5040 ret = __process_new_xattr(num, di_key, name, name_len, 5041 data, data_len, ctx); 5042 } else { 5043 ret = 0; 5044 } 5045 } 5046 5047 kfree(found_data); 5048 return ret; 5049 } 5050 5051 static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, 5052 const char *name, int name_len, 5053 const char *data, int data_len, 5054 void *ctx) 5055 { 5056 int ret; 5057 struct send_ctx *sctx = ctx; 
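/*
 * Only send a removal for this xattr if it no longer exists in the send
 * snapshot. If find_xattr() still finds it in the left (send) tree, the
 * xattr was kept or replaced and __process_changed_new_xattr() already
 * covers it, so there is nothing to delete.
 */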
5058 5059 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key, 5060 name, name_len, NULL, NULL); 5061 if (ret == -ENOENT) 5062 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 5063 data_len, ctx); 5064 else if (ret >= 0) 5065 ret = 0; 5066 5067 return ret; 5068 } 5069 5070 static int process_changed_xattr(struct send_ctx *sctx) 5071 { 5072 int ret; 5073 5074 ret = iterate_dir_item(sctx->send_root, sctx->left_path, 5075 __process_changed_new_xattr, sctx); 5076 if (ret < 0) 5077 return ret; 5078 5079 return iterate_dir_item(sctx->parent_root, sctx->right_path, 5080 __process_changed_deleted_xattr, sctx); 5081 } 5082 5083 static int process_all_new_xattrs(struct send_ctx *sctx) 5084 { 5085 int ret = 0; 5086 int iter_ret = 0; 5087 struct btrfs_root *root; 5088 struct btrfs_path *path; 5089 struct btrfs_key key; 5090 struct btrfs_key found_key; 5091 5092 path = alloc_path_for_send(); 5093 if (!path) 5094 return -ENOMEM; 5095 5096 root = sctx->send_root; 5097 5098 key.objectid = sctx->cmp_key->objectid; 5099 key.type = BTRFS_XATTR_ITEM_KEY; 5100 key.offset = 0; 5101 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 5102 if (found_key.objectid != key.objectid || 5103 found_key.type != key.type) { 5104 ret = 0; 5105 break; 5106 } 5107 5108 ret = iterate_dir_item(root, path, __process_new_xattr, sctx); 5109 if (ret < 0) 5110 break; 5111 } 5112 /* Catch error found during iteration */ 5113 if (iter_ret < 0) 5114 ret = iter_ret; 5115 5116 btrfs_free_path(path); 5117 return ret; 5118 } 5119 5120 static int send_verity(struct send_ctx *sctx, struct fs_path *path, 5121 struct fsverity_descriptor *desc) 5122 { 5123 int ret; 5124 5125 ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); 5126 if (ret < 0) 5127 return ret; 5128 5129 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); 5130 TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, 5131 le8_to_cpu(desc->hash_algorithm)); 5132 TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE, 5133 1U << le8_to_cpu(desc->log_blocksize)); 5134 TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt, 5135 le8_to_cpu(desc->salt_size)); 5136 TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature, 5137 le32_to_cpu(desc->sig_size)); 5138 5139 ret = send_cmd(sctx); 5140 5141 tlv_put_failure: 5142 return ret; 5143 } 5144 5145 static int process_verity(struct send_ctx *sctx) 5146 { 5147 int ret = 0; 5148 struct btrfs_inode *inode; 5149 struct fs_path *p; 5150 5151 inode = btrfs_iget(sctx->cur_ino, sctx->send_root); 5152 if (IS_ERR(inode)) 5153 return PTR_ERR(inode); 5154 5155 ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); 5156 if (ret < 0) 5157 goto iput; 5158 5159 if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { 5160 ret = -EMSGSIZE; 5161 goto iput; 5162 } 5163 if (!sctx->verity_descriptor) { 5164 sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE, 5165 GFP_KERNEL); 5166 if (!sctx->verity_descriptor) { 5167 ret = -ENOMEM; 5168 goto iput; 5169 } 5170 } 5171 5172 ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); 5173 if (ret < 0) 5174 goto iput; 5175 5176 p = get_cur_inode_path(sctx); 5177 if (IS_ERR(p)) { 5178 ret = PTR_ERR(p); 5179 goto iput; 5180 } 5181 5182 ret = send_verity(sctx, p, sctx->verity_descriptor); 5183 iput: 5184 iput(&inode->vfs_inode); 5185 return ret; 5186 } 5187 5188 static inline u64 max_send_read_size(const struct send_ctx *sctx) 5189 { 5190 return sctx->send_max_size - SZ_16K; 5191 } 5192 5193 static int put_data_header(struct send_ctx *sctx, u32 len) 5194 { 5195 if 
(WARN_ON_ONCE(sctx->put_data)) 5196 return -EINVAL; 5197 sctx->put_data = true; 5198 if (sctx->proto >= 2) { 5199 /* 5200 * Since v2, the data attribute header doesn't include a length, 5201 * it is implicitly to the end of the command. 5202 */ 5203 if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) 5204 return -EOVERFLOW; 5205 put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); 5206 sctx->send_size += sizeof(__le16); 5207 } else { 5208 struct btrfs_tlv_header *hdr; 5209 5210 if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) 5211 return -EOVERFLOW; 5212 hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); 5213 put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); 5214 put_unaligned_le16(len, &hdr->tlv_len); 5215 sctx->send_size += sizeof(*hdr); 5216 } 5217 return 0; 5218 } 5219 5220 static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) 5221 { 5222 struct btrfs_root *root = sctx->send_root; 5223 struct btrfs_fs_info *fs_info = root->fs_info; 5224 u64 cur = offset; 5225 const u64 end = offset + len; 5226 const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT); 5227 struct address_space *mapping = sctx->cur_inode->i_mapping; 5228 int ret; 5229 5230 ret = put_data_header(sctx, len); 5231 if (ret) 5232 return ret; 5233 5234 while (cur < end) { 5235 pgoff_t index = (cur >> PAGE_SHIFT); 5236 unsigned int cur_len; 5237 unsigned int pg_offset; 5238 struct folio *folio; 5239 5240 folio = filemap_lock_folio(mapping, index); 5241 if (IS_ERR(folio)) { 5242 page_cache_sync_readahead(mapping, 5243 &sctx->ra, NULL, index, 5244 last_index + 1 - index); 5245 5246 folio = filemap_grab_folio(mapping, index); 5247 if (IS_ERR(folio)) { 5248 ret = PTR_ERR(folio); 5249 break; 5250 } 5251 } 5252 pg_offset = offset_in_folio(folio, cur); 5253 cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset); 5254 5255 if (folio_test_readahead(folio)) 5256 page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, 5257 last_index + 1 - index); 5258 5259 if (!folio_test_uptodate(folio)) { 5260 btrfs_read_folio(NULL, folio); 5261 folio_lock(folio); 5262 if (!folio_test_uptodate(folio)) { 5263 folio_unlock(folio); 5264 btrfs_err(fs_info, 5265 "send: IO error at offset %llu for inode %llu root %llu", 5266 folio_pos(folio), sctx->cur_ino, 5267 btrfs_root_id(sctx->send_root)); 5268 folio_put(folio); 5269 ret = -EIO; 5270 break; 5271 } 5272 if (folio->mapping != mapping) { 5273 folio_unlock(folio); 5274 folio_put(folio); 5275 continue; 5276 } 5277 } 5278 5279 memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, 5280 pg_offset, cur_len); 5281 folio_unlock(folio); 5282 folio_put(folio); 5283 cur += cur_len; 5284 sctx->send_size += cur_len; 5285 } 5286 5287 return ret; 5288 } 5289 5290 /* 5291 * Read some bytes from the current inode/file and send a write command to 5292 * user space. 5293 */ 5294 static int send_write(struct send_ctx *sctx, u64 offset, u32 len) 5295 { 5296 int ret = 0; 5297 struct fs_path *p; 5298 5299 p = get_cur_inode_path(sctx); 5300 if (IS_ERR(p)) 5301 return PTR_ERR(p); 5302 5303 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 5304 if (ret < 0) 5305 return ret; 5306 5307 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5308 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5309 ret = put_file_data(sctx, offset, len); 5310 if (ret < 0) 5311 return ret; 5312 5313 ret = send_cmd(sctx); 5314 5315 tlv_put_failure: 5316 return ret; 5317 } 5318 5319 /* 5320 * Send a clone command to user space. 
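 * Besides the destination path, offset and length, the command carries
 * the clone source's root uuid (or received_uuid), ctransid, path and
 * offset, so that the receiver can locate the source file and issue the
 * clone ioctl.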
5321 */ 5322 static int send_clone(struct send_ctx *sctx, 5323 u64 offset, u32 len, 5324 struct clone_root *clone_root) 5325 { 5326 int ret = 0; 5327 struct fs_path *p; 5328 struct fs_path *cur_inode_path; 5329 u64 gen; 5330 5331 cur_inode_path = get_cur_inode_path(sctx); 5332 if (IS_ERR(cur_inode_path)) 5333 return PTR_ERR(cur_inode_path); 5334 5335 p = fs_path_alloc(); 5336 if (!p) 5337 return -ENOMEM; 5338 5339 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE); 5340 if (ret < 0) 5341 goto out; 5342 5343 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5344 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); 5345 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); 5346 5347 if (clone_root->root == sctx->send_root) { 5348 ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); 5349 if (ret < 0) 5350 goto out; 5351 ret = get_cur_path(sctx, clone_root->ino, gen, p); 5352 } else { 5353 ret = get_inode_path(clone_root->root, clone_root->ino, p); 5354 } 5355 if (ret < 0) 5356 goto out; 5357 5358 /* 5359 * If the parent we're using has a received_uuid set then use that as 5360 * our clone source as that is what we will look for when doing a 5361 * receive. 5362 * 5363 * This covers the case that we create a snapshot off of a received 5364 * subvolume and then use that as the parent and try to receive on a 5365 * different host. 5366 */ 5367 if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) 5368 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 5369 clone_root->root->root_item.received_uuid); 5370 else 5371 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 5372 clone_root->root->root_item.uuid); 5373 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 5374 btrfs_root_ctransid(&clone_root->root->root_item)); 5375 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 5376 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 5377 clone_root->offset); 5378 5379 ret = send_cmd(sctx); 5380 5381 tlv_put_failure: 5382 out: 5383 fs_path_free(p); 5384 return ret; 5385 } 5386 5387 /* 5388 * Send an update extent command to user space. 5389 */ 5390 static int send_update_extent(struct send_ctx *sctx, 5391 u64 offset, u32 len) 5392 { 5393 int ret = 0; 5394 struct fs_path *p; 5395 5396 p = get_cur_inode_path(sctx); 5397 if (IS_ERR(p)) 5398 return PTR_ERR(p); 5399 5400 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); 5401 if (ret < 0) 5402 return ret; 5403 5404 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5405 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5406 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); 5407 5408 ret = send_cmd(sctx); 5409 5410 tlv_put_failure: 5411 return ret; 5412 } 5413 5414 static int send_hole(struct send_ctx *sctx, u64 end) 5415 { 5416 struct fs_path *p = NULL; 5417 u64 read_size = max_send_read_size(sctx); 5418 u64 offset = sctx->cur_inode_last_extent; 5419 int ret = 0; 5420 5421 /* 5422 * A hole that starts at EOF or beyond it. Since we do not yet support 5423 * fallocate (for extent preallocation and hole punching), sending a 5424 * write of zeroes starting at EOF or beyond would later require issuing 5425 * a truncate operation which would undo the write and achieve nothing. 5426 */ 5427 if (offset >= sctx->cur_inode_size) 5428 return 0; 5429 5430 /* 5431 * Don't go beyond the inode's i_size due to prealloc extents that start 5432 * after the i_size. 
5433 */ 5434 end = min_t(u64, end, sctx->cur_inode_size); 5435 5436 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5437 return send_update_extent(sctx, offset, end - offset); 5438 5439 p = get_cur_inode_path(sctx); 5440 if (IS_ERR(p)) 5441 return PTR_ERR(p); 5442 5443 while (offset < end) { 5444 u64 len = min(end - offset, read_size); 5445 5446 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 5447 if (ret < 0) 5448 break; 5449 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 5450 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5451 ret = put_data_header(sctx, len); 5452 if (ret < 0) 5453 break; 5454 memset(sctx->send_buf + sctx->send_size, 0, len); 5455 sctx->send_size += len; 5456 ret = send_cmd(sctx); 5457 if (ret < 0) 5458 break; 5459 offset += len; 5460 } 5461 sctx->cur_inode_next_write_offset = offset; 5462 tlv_put_failure: 5463 return ret; 5464 } 5465 5466 static int send_encoded_inline_extent(struct send_ctx *sctx, 5467 struct btrfs_path *path, u64 offset, 5468 u64 len) 5469 { 5470 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; 5471 struct fs_path *fspath; 5472 struct extent_buffer *leaf = path->nodes[0]; 5473 struct btrfs_key key; 5474 struct btrfs_file_extent_item *ei; 5475 u64 ram_bytes; 5476 size_t inline_size; 5477 int ret; 5478 5479 fspath = get_cur_inode_path(sctx); 5480 if (IS_ERR(fspath)) 5481 return PTR_ERR(fspath); 5482 5483 ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); 5484 if (ret < 0) 5485 return ret; 5486 5487 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5488 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 5489 ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); 5490 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); 5491 5492 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); 5493 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5494 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, 5495 min(key.offset + ram_bytes - offset, len)); 5496 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); 5497 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); 5498 ret = btrfs_encoded_io_compression_from_extent(fs_info, 5499 btrfs_file_extent_compression(leaf, ei)); 5500 if (ret < 0) 5501 return ret; 5502 TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); 5503 5504 ret = put_data_header(sctx, inline_size); 5505 if (ret < 0) 5506 return ret; 5507 read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, 5508 btrfs_file_extent_inline_start(ei), inline_size); 5509 sctx->send_size += inline_size; 5510 5511 ret = send_cmd(sctx); 5512 5513 tlv_put_failure: 5514 return ret; 5515 } 5516 5517 static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, 5518 u64 offset, u64 len) 5519 { 5520 struct btrfs_root *root = sctx->send_root; 5521 struct btrfs_fs_info *fs_info = root->fs_info; 5522 struct btrfs_inode *inode; 5523 struct fs_path *fspath; 5524 struct extent_buffer *leaf = path->nodes[0]; 5525 struct btrfs_key key; 5526 struct btrfs_file_extent_item *ei; 5527 u64 disk_bytenr, disk_num_bytes; 5528 u32 data_offset; 5529 struct btrfs_cmd_header *hdr; 5530 u32 crc; 5531 int ret; 5532 5533 inode = btrfs_iget(sctx->cur_ino, root); 5534 if (IS_ERR(inode)) 5535 return PTR_ERR(inode); 5536 5537 fspath = get_cur_inode_path(sctx); 5538 if (IS_ERR(fspath)) { 5539 ret = PTR_ERR(fspath); 5540 goto out; 5541 } 5542 5543 ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); 5544 if (ret < 0) 5545 goto out; 5546 5547 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5548 ei = btrfs_item_ptr(leaf, 
path->slots[0], struct btrfs_file_extent_item); 5549 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 5550 disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); 5551 5552 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); 5553 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 5554 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, 5555 min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, 5556 len)); 5557 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, 5558 btrfs_file_extent_ram_bytes(leaf, ei)); 5559 TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, 5560 offset - key.offset + btrfs_file_extent_offset(leaf, ei)); 5561 ret = btrfs_encoded_io_compression_from_extent(fs_info, 5562 btrfs_file_extent_compression(leaf, ei)); 5563 if (ret < 0) 5564 goto out; 5565 TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); 5566 TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); 5567 5568 ret = put_data_header(sctx, disk_num_bytes); 5569 if (ret < 0) 5570 goto out; 5571 5572 /* 5573 * We want to do I/O directly into the send buffer, so get the next page 5574 * boundary in the send buffer. This means that there may be a gap 5575 * between the beginning of the command and the file data. 5576 */ 5577 data_offset = PAGE_ALIGN(sctx->send_size); 5578 if (data_offset > sctx->send_max_size || 5579 sctx->send_max_size - data_offset < disk_num_bytes) { 5580 ret = -EOVERFLOW; 5581 goto out; 5582 } 5583 5584 /* 5585 * Note that send_buf is a mapping of send_buf_pages, so this is really 5586 * reading into send_buf. 5587 */ 5588 ret = btrfs_encoded_read_regular_fill_pages(inode, 5589 disk_bytenr, disk_num_bytes, 5590 sctx->send_buf_pages + 5591 (data_offset >> PAGE_SHIFT), 5592 NULL); 5593 if (ret) 5594 goto out; 5595 5596 hdr = (struct btrfs_cmd_header *)sctx->send_buf; 5597 hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); 5598 hdr->crc = 0; 5599 crc = crc32c(0, sctx->send_buf, sctx->send_size); 5600 crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); 5601 hdr->crc = cpu_to_le32(crc); 5602 5603 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 5604 &sctx->send_off); 5605 if (!ret) { 5606 ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, 5607 disk_num_bytes, &sctx->send_off); 5608 } 5609 sctx->send_size = 0; 5610 sctx->put_data = false; 5611 5612 tlv_put_failure: 5613 out: 5614 iput(&inode->vfs_inode); 5615 return ret; 5616 } 5617 5618 static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, 5619 const u64 offset, const u64 len) 5620 { 5621 const u64 end = offset + len; 5622 struct extent_buffer *leaf = path->nodes[0]; 5623 struct btrfs_file_extent_item *ei; 5624 u64 read_size = max_send_read_size(sctx); 5625 u64 sent = 0; 5626 5627 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) 5628 return send_update_extent(sctx, offset, len); 5629 5630 ei = btrfs_item_ptr(leaf, path->slots[0], 5631 struct btrfs_file_extent_item); 5632 if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && 5633 btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { 5634 bool is_inline = (btrfs_file_extent_type(leaf, ei) == 5635 BTRFS_FILE_EXTENT_INLINE); 5636 5637 /* 5638 * Send the compressed extent unless the compressed data is 5639 * larger than the decompressed data. This can happen if we're 5640 * not sending the entire extent, either because it has been 5641 * partially overwritten/truncated or because this is a part of 5642 * the extent that we couldn't clone in clone_range(). 
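 * For example, if only 4K of an extent whose compressed on-disk size is
 * 16K must be sent, an encoded write would transfer the whole 16K, so a
 * plain write of just those 4K is cheaper.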
5643 */
5644 if (is_inline &&
5645 btrfs_file_extent_inline_item_len(leaf,
5646 path->slots[0]) <= len) {
5647 return send_encoded_inline_extent(sctx, path, offset,
5648 len);
5649 } else if (!is_inline &&
5650 btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
5651 return send_encoded_extent(sctx, path, offset, len);
5652 }
5653 }
5654
5655 if (sctx->cur_inode == NULL) {
5656 struct btrfs_inode *btrfs_inode;
5657 struct btrfs_root *root = sctx->send_root;
5658
5659 btrfs_inode = btrfs_iget(sctx->cur_ino, root);
5660 if (IS_ERR(btrfs_inode))
5661 return PTR_ERR(btrfs_inode);
5662
5663 sctx->cur_inode = &btrfs_inode->vfs_inode;
5664 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
5665 file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
5666
5667 /*
5668 * It's very likely there are no pages from this inode in the page
5669 * cache, so after reading extents and sending their data, we clean
5670 * the page cache to avoid trashing the page cache (adding pressure
5671 * to the page cache and forcing eviction of other data more useful
5672 * for applications).
5673 *
5674 * We decide if we should clean the page cache simply by checking
5675 * if the inode's mapping nrpages is 0 when we first open it, and
5676 * not by using something like filemap_range_has_page() before
5677 * reading an extent because when we ask the readahead code to
5678 * read a given file range, it may (and almost always does) read
5679 * pages from beyond that range (see the documentation for
5680 * page_cache_sync_readahead()), so it would not be reliable,
5681 * because after reading the first extent future calls to
5682 * filemap_range_has_page() would return true because the readahead
5683 * on the previous extent resulted in reading pages of the current
5684 * extent as well.
5685 */
5686 sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
5687 sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
5688 }
5689
5690 while (sent < len) {
5691 u64 size = min(len - sent, read_size);
5692 int ret;
5693
5694 ret = send_write(sctx, offset + sent, size);
5695 if (ret < 0)
5696 return ret;
5697 sent += size;
5698 }
5699
5700 if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
5701 /*
5702 * Always operate only on ranges that are a multiple of the page
5703 * size. This is not only to prevent zeroing parts of a page in
5704 * the case of subpage sector size, but also to guarantee we evict
5705 * pages, as passing a range that is smaller than page size does
5706 * not evict the respective page (only zeroes part of its content).
5707 *
5708 * Always start from the end offset of the last range cleared.
5709 * This is because the readahead code may (and very often does)
5710 * read pages beyond the range we request for readahead. So if
5711 * we have an extent layout like this:
5712 *
5713 * [ extent A ] [ extent B ] [ extent C ]
5714 *
5715 * When we ask page_cache_sync_readahead() to read extent A, it
5716 * may also trigger reads for pages of extent B. If we are doing
5717 * an incremental send and extent B has not changed between the
5718 * parent and send snapshots, some or all of its pages may end
5719 * up being read and placed in the page cache. So when truncating
5720 * the page cache we always start from the end offset of the
5721 * previously processed extent up to the end of the current
5722 * extent.
5723 */ 5724 truncate_inode_pages_range(&sctx->cur_inode->i_data, 5725 sctx->page_cache_clear_start, 5726 end - 1); 5727 sctx->page_cache_clear_start = end; 5728 } 5729 5730 return 0; 5731 } 5732 5733 /* 5734 * Search for a capability xattr related to sctx->cur_ino. If the capability is 5735 * found, call send_set_xattr function to emit it. 5736 * 5737 * Return 0 if there isn't a capability, or when the capability was emitted 5738 * successfully, or < 0 if an error occurred. 5739 */ 5740 static int send_capabilities(struct send_ctx *sctx) 5741 { 5742 struct btrfs_path *path; 5743 struct btrfs_dir_item *di; 5744 struct extent_buffer *leaf; 5745 unsigned long data_ptr; 5746 char *buf = NULL; 5747 int buf_len; 5748 int ret = 0; 5749 5750 path = alloc_path_for_send(); 5751 if (!path) 5752 return -ENOMEM; 5753 5754 di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, 5755 XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); 5756 if (!di) { 5757 /* There is no xattr for this inode */ 5758 goto out; 5759 } else if (IS_ERR(di)) { 5760 ret = PTR_ERR(di); 5761 goto out; 5762 } 5763 5764 leaf = path->nodes[0]; 5765 buf_len = btrfs_dir_data_len(leaf, di); 5766 5767 buf = kmalloc(buf_len, GFP_KERNEL); 5768 if (!buf) { 5769 ret = -ENOMEM; 5770 goto out; 5771 } 5772 5773 data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); 5774 read_extent_buffer(leaf, buf, data_ptr, buf_len); 5775 5776 ret = send_set_xattr(sctx, XATTR_NAME_CAPS, 5777 strlen(XATTR_NAME_CAPS), buf, buf_len); 5778 out: 5779 kfree(buf); 5780 btrfs_free_path(path); 5781 return ret; 5782 } 5783 5784 static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, 5785 struct clone_root *clone_root, const u64 disk_byte, 5786 u64 data_offset, u64 offset, u64 len) 5787 { 5788 struct btrfs_path *path; 5789 struct btrfs_key key; 5790 int ret; 5791 struct btrfs_inode_info info; 5792 u64 clone_src_i_size = 0; 5793 5794 /* 5795 * Prevent cloning from a zero offset with a length matching the sector 5796 * size because in some scenarios this will make the receiver fail. 5797 * 5798 * For example, if in the source filesystem the extent at offset 0 5799 * has a length of sectorsize and it was written using direct IO, then 5800 * it can never be an inline extent (even if compression is enabled). 5801 * Then this extent can be cloned in the original filesystem to a non 5802 * zero file offset, but it may not be possible to clone in the 5803 * destination filesystem because it can be inlined due to compression 5804 * on the destination filesystem (as the receiver's write operations are 5805 * always done using buffered IO). The same happens when the original 5806 * filesystem does not have compression enabled but the destination 5807 * filesystem has. 5808 */ 5809 if (clone_root->offset == 0 && 5810 len == sctx->send_root->fs_info->sectorsize) 5811 return send_extent_data(sctx, dst_path, offset, len); 5812 5813 path = alloc_path_for_send(); 5814 if (!path) 5815 return -ENOMEM; 5816 5817 /* 5818 * There are inodes that have extents that lie behind its i_size. Don't 5819 * accept clones from these extents. 5820 */ 5821 ret = get_inode_info(clone_root->root, clone_root->ino, &info); 5822 btrfs_release_path(path); 5823 if (ret < 0) 5824 goto out; 5825 clone_src_i_size = info.size; 5826 5827 /* 5828 * We can't send a clone operation for the entire range if we find 5829 * extent items in the respective range in the source file that 5830 * refer to different extents or if we find holes. 
5831 * So check for that and do a mix of clone and regular write/copy 5832 * operations if needed. 5833 * 5834 * Example: 5835 * 5836 * mkfs.btrfs -f /dev/sda 5837 * mount /dev/sda /mnt 5838 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo 5839 * cp --reflink=always /mnt/foo /mnt/bar 5840 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo 5841 * btrfs subvolume snapshot -r /mnt /mnt/snap 5842 * 5843 * If when we send the snapshot and we are processing file bar (which 5844 * has a higher inode number than foo) we blindly send a clone operation 5845 * for the [0, 100K[ range from foo to bar, the receiver ends up getting 5846 * a file bar that matches the content of file foo - iow, doesn't match 5847 * the content from bar in the original filesystem. 5848 */ 5849 key.objectid = clone_root->ino; 5850 key.type = BTRFS_EXTENT_DATA_KEY; 5851 key.offset = clone_root->offset; 5852 ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); 5853 if (ret < 0) 5854 goto out; 5855 if (ret > 0 && path->slots[0] > 0) { 5856 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 5857 if (key.objectid == clone_root->ino && 5858 key.type == BTRFS_EXTENT_DATA_KEY) 5859 path->slots[0]--; 5860 } 5861 5862 while (true) { 5863 struct extent_buffer *leaf = path->nodes[0]; 5864 int slot = path->slots[0]; 5865 struct btrfs_file_extent_item *ei; 5866 u8 type; 5867 u64 ext_len; 5868 u64 clone_len; 5869 u64 clone_data_offset; 5870 bool crossed_src_i_size = false; 5871 5872 if (slot >= btrfs_header_nritems(leaf)) { 5873 ret = btrfs_next_leaf(clone_root->root, path); 5874 if (ret < 0) 5875 goto out; 5876 else if (ret > 0) 5877 break; 5878 continue; 5879 } 5880 5881 btrfs_item_key_to_cpu(leaf, &key, slot); 5882 5883 /* 5884 * We might have an implicit trailing hole (NO_HOLES feature 5885 * enabled). We deal with it after leaving this loop. 5886 */ 5887 if (key.objectid != clone_root->ino || 5888 key.type != BTRFS_EXTENT_DATA_KEY) 5889 break; 5890 5891 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 5892 type = btrfs_file_extent_type(leaf, ei); 5893 if (type == BTRFS_FILE_EXTENT_INLINE) { 5894 ext_len = btrfs_file_extent_ram_bytes(leaf, ei); 5895 ext_len = PAGE_ALIGN(ext_len); 5896 } else { 5897 ext_len = btrfs_file_extent_num_bytes(leaf, ei); 5898 } 5899 5900 if (key.offset + ext_len <= clone_root->offset) 5901 goto next; 5902 5903 if (key.offset > clone_root->offset) { 5904 /* Implicit hole, NO_HOLES feature enabled. 
*/ 5905 u64 hole_len = key.offset - clone_root->offset;
5906
5907 if (hole_len > len)
5908 hole_len = len;
5909 ret = send_extent_data(sctx, dst_path, offset,
5910 hole_len);
5911 if (ret < 0)
5912 goto out;
5913
5914 len -= hole_len;
5915 if (len == 0)
5916 break;
5917 offset += hole_len;
5918 clone_root->offset += hole_len;
5919 data_offset += hole_len;
5920 }
5921
5922 if (key.offset >= clone_root->offset + len)
5923 break;
5924
5925 if (key.offset >= clone_src_i_size)
5926 break;
5927
5928 if (key.offset + ext_len > clone_src_i_size) {
5929 ext_len = clone_src_i_size - key.offset;
5930 crossed_src_i_size = true;
5931 }
5932
5933 clone_data_offset = btrfs_file_extent_offset(leaf, ei);
5934 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
5935 clone_root->offset = key.offset;
5936 if (clone_data_offset < data_offset &&
5937 clone_data_offset + ext_len > data_offset) {
5938 u64 extent_offset;
5939
5940 extent_offset = data_offset - clone_data_offset;
5941 ext_len -= extent_offset;
5942 clone_data_offset += extent_offset;
5943 clone_root->offset += extent_offset;
5944 }
5945 }
5946
5947 clone_len = min_t(u64, ext_len, len);
5948
5949 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
5950 clone_data_offset == data_offset) {
5951 const u64 src_end = clone_root->offset + clone_len;
5952 const u64 sectorsize = SZ_64K;
5953
5954 /*
5955 * We can't clone the last block, when its size is not
5956 * sector size aligned, into the middle of a file. If we
5957 * do so, the receiver will get a failure (-EINVAL) when
5958 * trying to clone or will silently corrupt the data in
5959 * the destination file if it's on a kernel without the
5960 * fix introduced by commit ac765f83f1397646
5961 * ("Btrfs: fix data corruption due to cloning of eof
5962 * block").
5963 *
5964 * So issue a clone of the aligned down range plus a
5965 * regular write for the eof block, if we hit that case.
5966 *
5967 * Also, we use the maximum possible sector size, 64K,
5968 * because we don't know the sector size of the
5969 * filesystem that receives the stream, so we have to
5970 * assume the largest possible sector size.
5971 */
5972 if (src_end == clone_src_i_size &&
5973 !IS_ALIGNED(src_end, sectorsize) &&
5974 offset + clone_len < sctx->cur_inode_size) {
5975 u64 slen;
5976
5977 slen = ALIGN_DOWN(src_end - clone_root->offset,
5978 sectorsize);
5979 if (slen > 0) {
5980 ret = send_clone(sctx, offset, slen,
5981 clone_root);
5982 if (ret < 0)
5983 goto out;
5984 }
5985 ret = send_extent_data(sctx, dst_path,
5986 offset + slen,
5987 clone_len - slen);
5988 } else {
5989 ret = send_clone(sctx, offset, clone_len,
5990 clone_root);
5991 }
5992 } else if (crossed_src_i_size && clone_len < len) {
5993 /*
5994 * If we are at i_size of the clone source inode and we
5995 * cannot clone from it, terminate the loop. This is
5996 * to avoid sending two write operations, one with a
5997 * length matching clone_len and the final one after
5998 * this loop with a length of len - clone_len.
5999 *
6000 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
6001 * was passed to the send ioctl), this helps avoid
6002 * sending an encoded write for an offset that is not
6003 * sector size aligned, in case the i_size of the source
6004 * inode is not sector size aligned. That will make the
6005 * receiver fall back to decompression of the data and
6006 * writing it using regular buffered IO, therefore while
6007 * not incorrect, it's not optimal due to decompression and
6008 * possible re-compression at the receiver.
6009 */ 6010 break; 6011 } else { 6012 ret = send_extent_data(sctx, dst_path, offset, 6013 clone_len); 6014 } 6015 6016 if (ret < 0) 6017 goto out; 6018 6019 len -= clone_len; 6020 if (len == 0) 6021 break; 6022 offset += clone_len; 6023 clone_root->offset += clone_len; 6024 6025 /* 6026 * If we are cloning from the file we are currently processing, 6027 * and using the send root as the clone root, we must stop once 6028 * the current clone offset reaches the current eof of the file 6029 * at the receiver, otherwise we would issue an invalid clone 6030 * operation (source range going beyond eof) and cause the 6031 * receiver to fail. So if we reach the current eof, bail out 6032 * and fallback to a regular write. 6033 */ 6034 if (clone_root->root == sctx->send_root && 6035 clone_root->ino == sctx->cur_ino && 6036 clone_root->offset >= sctx->cur_inode_next_write_offset) 6037 break; 6038 6039 data_offset += clone_len; 6040 next: 6041 path->slots[0]++; 6042 } 6043 6044 if (len > 0) 6045 ret = send_extent_data(sctx, dst_path, offset, len); 6046 else 6047 ret = 0; 6048 out: 6049 btrfs_free_path(path); 6050 return ret; 6051 } 6052 6053 static int send_write_or_clone(struct send_ctx *sctx, 6054 struct btrfs_path *path, 6055 struct btrfs_key *key, 6056 struct clone_root *clone_root) 6057 { 6058 int ret = 0; 6059 u64 offset = key->offset; 6060 u64 end; 6061 u64 bs = sctx->send_root->fs_info->sectorsize; 6062 struct btrfs_file_extent_item *ei; 6063 u64 disk_byte; 6064 u64 data_offset; 6065 u64 num_bytes; 6066 struct btrfs_inode_info info = { 0 }; 6067 6068 end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); 6069 if (offset >= end) 6070 return 0; 6071 6072 num_bytes = end - offset; 6073 6074 if (!clone_root) 6075 goto write_data; 6076 6077 if (IS_ALIGNED(end, bs)) 6078 goto clone_data; 6079 6080 /* 6081 * If the extent end is not aligned, we can clone if the extent ends at 6082 * the i_size of the inode and the clone range ends at the i_size of the 6083 * source inode, otherwise the clone operation fails with -EINVAL. 6084 */ 6085 if (end != sctx->cur_inode_size) 6086 goto write_data; 6087 6088 ret = get_inode_info(clone_root->root, clone_root->ino, &info); 6089 if (ret < 0) 6090 return ret; 6091 6092 if (clone_root->offset + num_bytes == info.size) { 6093 /* 6094 * The final size of our file matches the end offset, but it may 6095 * be that its current size is larger, so we have to truncate it 6096 * to any value between the start offset of the range and the 6097 * final i_size, otherwise the clone operation is invalid 6098 * because it's unaligned and it ends before the current EOF. 6099 * We do this truncate to the final i_size when we finish 6100 * processing the inode, but it's too late by then. And here we 6101 * truncate to the start offset of the range because it's always 6102 * sector size aligned while if it were the final i_size it 6103 * would result in dirtying part of a page, filling part of a 6104 * page with zeroes and then having the clone operation at the 6105 * receiver trigger IO and wait for it due to the dirty page. 
6106 */ 6107 if (sctx->parent_root != NULL) { 6108 ret = send_truncate(sctx, sctx->cur_ino, 6109 sctx->cur_inode_gen, offset); 6110 if (ret < 0) 6111 return ret; 6112 } 6113 goto clone_data; 6114 } 6115 6116 write_data: 6117 ret = send_extent_data(sctx, path, offset, num_bytes); 6118 sctx->cur_inode_next_write_offset = end; 6119 return ret; 6120 6121 clone_data: 6122 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 6123 struct btrfs_file_extent_item); 6124 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); 6125 data_offset = btrfs_file_extent_offset(path->nodes[0], ei); 6126 ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset, 6127 num_bytes); 6128 sctx->cur_inode_next_write_offset = end; 6129 return ret; 6130 } 6131 6132 static int is_extent_unchanged(struct send_ctx *sctx, 6133 struct btrfs_path *left_path, 6134 struct btrfs_key *ekey) 6135 { 6136 int ret = 0; 6137 struct btrfs_key key; 6138 struct btrfs_path *path = NULL; 6139 struct extent_buffer *eb; 6140 int slot; 6141 struct btrfs_key found_key; 6142 struct btrfs_file_extent_item *ei; 6143 u64 left_disknr; 6144 u64 right_disknr; 6145 u64 left_offset; 6146 u64 right_offset; 6147 u64 left_offset_fixed; 6148 u64 left_len; 6149 u64 right_len; 6150 u64 left_gen; 6151 u64 right_gen; 6152 u8 left_type; 6153 u8 right_type; 6154 6155 path = alloc_path_for_send(); 6156 if (!path) 6157 return -ENOMEM; 6158 6159 eb = left_path->nodes[0]; 6160 slot = left_path->slots[0]; 6161 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 6162 left_type = btrfs_file_extent_type(eb, ei); 6163 6164 if (left_type != BTRFS_FILE_EXTENT_REG) { 6165 ret = 0; 6166 goto out; 6167 } 6168 left_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 6169 left_len = btrfs_file_extent_num_bytes(eb, ei); 6170 left_offset = btrfs_file_extent_offset(eb, ei); 6171 left_gen = btrfs_file_extent_generation(eb, ei); 6172 6173 /* 6174 * Following comments will refer to these graphics. L is the left 6175 * extents which we are checking at the moment. 1-8 are the right 6176 * extents that we iterate. 6177 * 6178 * |-----L-----| 6179 * |-1-|-2a-|-3-|-4-|-5-|-6-| 6180 * 6181 * |-----L-----| 6182 * |--1--|-2b-|...(same as above) 6183 * 6184 * Alternative situation. Happens on files where extents got split. 6185 * |-----L-----| 6186 * |-----------7-----------|-6-| 6187 * 6188 * Alternative situation. Happens on files which got larger. 6189 * |-----L-----| 6190 * |-8-| 6191 * Nothing follows after 8. 6192 */ 6193 6194 key.objectid = ekey->objectid; 6195 key.type = BTRFS_EXTENT_DATA_KEY; 6196 key.offset = ekey->offset; 6197 ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0); 6198 if (ret < 0) 6199 goto out; 6200 if (ret) { 6201 ret = 0; 6202 goto out; 6203 } 6204 6205 /* 6206 * Handle special case where the right side has no extents at all. 6207 */ 6208 eb = path->nodes[0]; 6209 slot = path->slots[0]; 6210 btrfs_item_key_to_cpu(eb, &found_key, slot); 6211 if (found_key.objectid != key.objectid || 6212 found_key.type != key.type) { 6213 /* If we're a hole then just pretend nothing changed */ 6214 ret = (left_disknr) ? 0 : 1; 6215 goto out; 6216 } 6217 6218 /* 6219 * We're now on 2a, 2b or 7. 
6220 */ 6221 key = found_key; 6222 while (key.offset < ekey->offset + left_len) { 6223 ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 6224 right_type = btrfs_file_extent_type(eb, ei); 6225 if (right_type != BTRFS_FILE_EXTENT_REG && 6226 right_type != BTRFS_FILE_EXTENT_INLINE) { 6227 ret = 0; 6228 goto out; 6229 } 6230 6231 if (right_type == BTRFS_FILE_EXTENT_INLINE) { 6232 right_len = btrfs_file_extent_ram_bytes(eb, ei); 6233 right_len = PAGE_ALIGN(right_len); 6234 } else { 6235 right_len = btrfs_file_extent_num_bytes(eb, ei); 6236 } 6237 6238 /* 6239 * Are we at extent 8? If yes, we know the extent is changed. 6240 * This may only happen on the first iteration. 6241 */ 6242 if (found_key.offset + right_len <= ekey->offset) { 6243 /* If we're a hole just pretend nothing changed */ 6244 ret = (left_disknr) ? 0 : 1; 6245 goto out; 6246 } 6247 6248 /* 6249 * We just wanted to see if when we have an inline extent, what 6250 * follows it is a regular extent (wanted to check the above 6251 * condition for inline extents too). This should normally not 6252 * happen but it's possible for example when we have an inline 6253 * compressed extent representing data with a size matching 6254 * the page size (currently the same as sector size). 6255 */ 6256 if (right_type == BTRFS_FILE_EXTENT_INLINE) { 6257 ret = 0; 6258 goto out; 6259 } 6260 6261 right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); 6262 right_offset = btrfs_file_extent_offset(eb, ei); 6263 right_gen = btrfs_file_extent_generation(eb, ei); 6264 6265 left_offset_fixed = left_offset; 6266 if (key.offset < ekey->offset) { 6267 /* Fix the right offset for 2a and 7. */ 6268 right_offset += ekey->offset - key.offset; 6269 } else { 6270 /* Fix the left offset for all behind 2a and 2b */ 6271 left_offset_fixed += key.offset - ekey->offset; 6272 } 6273 6274 /* 6275 * Check if we have the same extent. 6276 */ 6277 if (left_disknr != right_disknr || 6278 left_offset_fixed != right_offset || 6279 left_gen != right_gen) { 6280 ret = 0; 6281 goto out; 6282 } 6283 6284 /* 6285 * Go to the next extent. 6286 */ 6287 ret = btrfs_next_item(sctx->parent_root, path); 6288 if (ret < 0) 6289 goto out; 6290 if (!ret) { 6291 eb = path->nodes[0]; 6292 slot = path->slots[0]; 6293 btrfs_item_key_to_cpu(eb, &found_key, slot); 6294 } 6295 if (ret || found_key.objectid != key.objectid || 6296 found_key.type != key.type) { 6297 key.offset += right_len; 6298 break; 6299 } 6300 if (found_key.offset != key.offset + right_len) { 6301 ret = 0; 6302 goto out; 6303 } 6304 key = found_key; 6305 } 6306 6307 /* 6308 * We're now behind the left extent (treat as unchanged) or at the end 6309 * of the right side (treat as changed). 
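 * Note that if we ran out of extent items for our inode in the loop
 * above, key.offset was advanced by right_len before breaking out, so
 * the check below still reflects how far the parent's extents reached.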
6310 */ 6311 if (key.offset >= ekey->offset + left_len) 6312 ret = 1; 6313 else 6314 ret = 0; 6315 6316 6317 out: 6318 btrfs_free_path(path); 6319 return ret; 6320 } 6321 6322 static int get_last_extent(struct send_ctx *sctx, u64 offset) 6323 { 6324 struct btrfs_path *path; 6325 struct btrfs_root *root = sctx->send_root; 6326 struct btrfs_key key; 6327 int ret; 6328 6329 path = alloc_path_for_send(); 6330 if (!path) 6331 return -ENOMEM; 6332 6333 sctx->cur_inode_last_extent = 0; 6334 6335 key.objectid = sctx->cur_ino; 6336 key.type = BTRFS_EXTENT_DATA_KEY; 6337 key.offset = offset; 6338 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1); 6339 if (ret < 0) 6340 goto out; 6341 ret = 0; 6342 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 6343 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY) 6344 goto out; 6345 6346 sctx->cur_inode_last_extent = btrfs_file_extent_end(path); 6347 out: 6348 btrfs_free_path(path); 6349 return ret; 6350 } 6351 6352 static int range_is_hole_in_parent(struct send_ctx *sctx, 6353 const u64 start, 6354 const u64 end) 6355 { 6356 struct btrfs_path *path; 6357 struct btrfs_key key; 6358 struct btrfs_root *root = sctx->parent_root; 6359 u64 search_start = start; 6360 int ret; 6361 6362 path = alloc_path_for_send(); 6363 if (!path) 6364 return -ENOMEM; 6365 6366 key.objectid = sctx->cur_ino; 6367 key.type = BTRFS_EXTENT_DATA_KEY; 6368 key.offset = search_start; 6369 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6370 if (ret < 0) 6371 goto out; 6372 if (ret > 0 && path->slots[0] > 0) 6373 path->slots[0]--; 6374 6375 while (search_start < end) { 6376 struct extent_buffer *leaf = path->nodes[0]; 6377 int slot = path->slots[0]; 6378 struct btrfs_file_extent_item *fi; 6379 u64 extent_end; 6380 6381 if (slot >= btrfs_header_nritems(leaf)) { 6382 ret = btrfs_next_leaf(root, path); 6383 if (ret < 0) 6384 goto out; 6385 else if (ret > 0) 6386 break; 6387 continue; 6388 } 6389 6390 btrfs_item_key_to_cpu(leaf, &key, slot); 6391 if (key.objectid < sctx->cur_ino || 6392 key.type < BTRFS_EXTENT_DATA_KEY) 6393 goto next; 6394 if (key.objectid > sctx->cur_ino || 6395 key.type > BTRFS_EXTENT_DATA_KEY || 6396 key.offset >= end) 6397 break; 6398 6399 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 6400 extent_end = btrfs_file_extent_end(path); 6401 if (extent_end <= start) 6402 goto next; 6403 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) { 6404 search_start = extent_end; 6405 goto next; 6406 } 6407 ret = 0; 6408 goto out; 6409 next: 6410 path->slots[0]++; 6411 } 6412 ret = 1; 6413 out: 6414 btrfs_free_path(path); 6415 return ret; 6416 } 6417 6418 static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, 6419 struct btrfs_key *key) 6420 { 6421 int ret = 0; 6422 6423 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx)) 6424 return 0; 6425 6426 /* 6427 * Get last extent's end offset (exclusive) if we haven't determined it 6428 * yet (we're processing the first file extent item that is new), or if 6429 * we're at the first slot of a leaf and the last extent's end is less 6430 * than the current extent's offset, because we might have skipped 6431 * entire leaves that contained only file extent items for our current 6432 * inode. These leaves have a generation number smaller (older) than the 6433 * one in the current leaf and the leaf our last extent came from, and 6434 * are located between these 2 leaves. 
6435 */ 6436 if ((sctx->cur_inode_last_extent == (u64)-1) || 6437 (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) { 6438 ret = get_last_extent(sctx, key->offset - 1); 6439 if (ret) 6440 return ret; 6441 } 6442 6443 if (sctx->cur_inode_last_extent < key->offset) { 6444 ret = range_is_hole_in_parent(sctx, 6445 sctx->cur_inode_last_extent, 6446 key->offset); 6447 if (ret < 0) 6448 return ret; 6449 else if (ret == 0) 6450 ret = send_hole(sctx, key->offset); 6451 else 6452 ret = 0; 6453 } 6454 sctx->cur_inode_last_extent = btrfs_file_extent_end(path); 6455 return ret; 6456 } 6457 6458 static int process_extent(struct send_ctx *sctx, 6459 struct btrfs_path *path, 6460 struct btrfs_key *key) 6461 { 6462 struct clone_root *found_clone = NULL; 6463 int ret = 0; 6464 6465 if (S_ISLNK(sctx->cur_inode_mode)) 6466 return 0; 6467 6468 if (sctx->parent_root && !sctx->cur_inode_new) { 6469 ret = is_extent_unchanged(sctx, path, key); 6470 if (ret < 0) 6471 goto out; 6472 if (ret) { 6473 ret = 0; 6474 goto out_hole; 6475 } 6476 } else { 6477 struct btrfs_file_extent_item *ei; 6478 u8 type; 6479 6480 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 6481 struct btrfs_file_extent_item); 6482 type = btrfs_file_extent_type(path->nodes[0], ei); 6483 if (type == BTRFS_FILE_EXTENT_PREALLOC || 6484 type == BTRFS_FILE_EXTENT_REG) { 6485 /* 6486 * The send spec does not have a prealloc command yet, 6487 * so just leave a hole for prealloc'ed extents until 6488 * we have enough commands queued up to justify rev'ing 6489 * the send spec. 6490 */ 6491 if (type == BTRFS_FILE_EXTENT_PREALLOC) { 6492 ret = 0; 6493 goto out; 6494 } 6495 6496 /* Have a hole, just skip it. */ 6497 if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { 6498 ret = 0; 6499 goto out; 6500 } 6501 } 6502 } 6503 6504 ret = find_extent_clone(sctx, path, key->objectid, key->offset, 6505 sctx->cur_inode_size, &found_clone); 6506 if (ret != -ENOENT && ret < 0) 6507 goto out; 6508 6509 ret = send_write_or_clone(sctx, path, key, found_clone); 6510 if (ret) 6511 goto out; 6512 out_hole: 6513 ret = maybe_send_hole(sctx, path, key); 6514 out: 6515 return ret; 6516 } 6517 6518 static int process_all_extents(struct send_ctx *sctx) 6519 { 6520 int ret = 0; 6521 int iter_ret = 0; 6522 struct btrfs_root *root; 6523 struct btrfs_path *path; 6524 struct btrfs_key key; 6525 struct btrfs_key found_key; 6526 6527 root = sctx->send_root; 6528 path = alloc_path_for_send(); 6529 if (!path) 6530 return -ENOMEM; 6531 6532 key.objectid = sctx->cmp_key->objectid; 6533 key.type = BTRFS_EXTENT_DATA_KEY; 6534 key.offset = 0; 6535 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 6536 if (found_key.objectid != key.objectid || 6537 found_key.type != key.type) { 6538 ret = 0; 6539 break; 6540 } 6541 6542 ret = process_extent(sctx, path, &found_key); 6543 if (ret < 0) 6544 break; 6545 } 6546 /* Catch error found during iteration */ 6547 if (iter_ret < 0) 6548 ret = iter_ret; 6549 6550 btrfs_free_path(path); 6551 return ret; 6552 } 6553 6554 static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end, 6555 int *pending_move, 6556 int *refs_processed) 6557 { 6558 int ret = 0; 6559 6560 if (sctx->cur_ino == 0) 6561 goto out; 6562 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && 6563 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) 6564 goto out; 6565 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 6566 goto out; 6567 6568 ret = process_recorded_refs(sctx, pending_move); 6569 if (ret < 0) 6570 goto 
out; 6571 6572 *refs_processed = 1; 6573 out: 6574 return ret; 6575 } 6576 6577 static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) 6578 { 6579 int ret = 0; 6580 struct btrfs_inode_info info; 6581 u64 left_mode; 6582 u64 left_uid; 6583 u64 left_gid; 6584 u64 left_fileattr; 6585 u64 right_mode; 6586 u64 right_uid; 6587 u64 right_gid; 6588 u64 right_fileattr; 6589 int need_chmod = 0; 6590 int need_chown = 0; 6591 bool need_fileattr = false; 6592 int need_truncate = 1; 6593 int pending_move = 0; 6594 int refs_processed = 0; 6595 6596 if (sctx->ignore_cur_inode) 6597 return 0; 6598 6599 ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move, 6600 &refs_processed); 6601 if (ret < 0) 6602 goto out; 6603 6604 /* 6605 * We have processed the refs and thus need to advance send_progress. 6606 * Now, calls to get_cur_xxx will take the updated refs of the current 6607 * inode into account. 6608 * 6609 * On the other hand, if our current inode is a directory and couldn't 6610 * be moved/renamed because its parent was renamed/moved too and it has 6611 * a higher inode number, we can only move/rename our current inode 6612 * after we moved/renamed its parent. Therefore in this case operate on 6613 * the old path (pre move/rename) of our current inode, and the 6614 * move/rename will be performed later. 6615 */ 6616 if (refs_processed && !pending_move) 6617 sctx->send_progress = sctx->cur_ino + 1; 6618 6619 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) 6620 goto out; 6621 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) 6622 goto out; 6623 ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info); 6624 if (ret < 0) 6625 goto out; 6626 left_mode = info.mode; 6627 left_uid = info.uid; 6628 left_gid = info.gid; 6629 left_fileattr = info.fileattr; 6630 6631 if (!sctx->parent_root || sctx->cur_inode_new) { 6632 need_chown = 1; 6633 if (!S_ISLNK(sctx->cur_inode_mode)) 6634 need_chmod = 1; 6635 if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) 6636 need_truncate = 0; 6637 } else { 6638 u64 old_size; 6639 6640 ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info); 6641 if (ret < 0) 6642 goto out; 6643 old_size = info.size; 6644 right_mode = info.mode; 6645 right_uid = info.uid; 6646 right_gid = info.gid; 6647 right_fileattr = info.fileattr; 6648 6649 if (left_uid != right_uid || left_gid != right_gid) 6650 need_chown = 1; 6651 if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) 6652 need_chmod = 1; 6653 if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) 6654 need_fileattr = true; 6655 if ((old_size == sctx->cur_inode_size) || 6656 (sctx->cur_inode_size > old_size && 6657 sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) 6658 need_truncate = 0; 6659 } 6660 6661 if (S_ISREG(sctx->cur_inode_mode)) { 6662 if (need_send_hole(sctx)) { 6663 if (sctx->cur_inode_last_extent == (u64)-1 || 6664 sctx->cur_inode_last_extent < 6665 sctx->cur_inode_size) { 6666 ret = get_last_extent(sctx, (u64)-1); 6667 if (ret) 6668 goto out; 6669 } 6670 if (sctx->cur_inode_last_extent < sctx->cur_inode_size) { 6671 ret = range_is_hole_in_parent(sctx, 6672 sctx->cur_inode_last_extent, 6673 sctx->cur_inode_size); 6674 if (ret < 0) { 6675 goto out; 6676 } else if (ret == 0) { 6677 ret = send_hole(sctx, sctx->cur_inode_size); 6678 if (ret < 0) 6679 goto out; 6680 } else { 6681 /* Range is already a hole, skip. 
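 * The receiver's file is based on the parent snapshot, so it has no
 * data in this range either and nothing needs to be sent for it.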
*/ 6682 ret = 0; 6683 } 6684 } 6685 } 6686 if (need_truncate) { 6687 ret = send_truncate(sctx, sctx->cur_ino, 6688 sctx->cur_inode_gen, 6689 sctx->cur_inode_size); 6690 if (ret < 0) 6691 goto out; 6692 } 6693 } 6694 6695 if (need_chown) { 6696 ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6697 left_uid, left_gid); 6698 if (ret < 0) 6699 goto out; 6700 } 6701 if (need_chmod) { 6702 ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6703 left_mode); 6704 if (ret < 0) 6705 goto out; 6706 } 6707 if (need_fileattr) { 6708 ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, 6709 left_fileattr); 6710 if (ret < 0) 6711 goto out; 6712 } 6713 6714 if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY) 6715 && sctx->cur_inode_needs_verity) { 6716 ret = process_verity(sctx); 6717 if (ret < 0) 6718 goto out; 6719 } 6720 6721 ret = send_capabilities(sctx); 6722 if (ret < 0) 6723 goto out; 6724 6725 /* 6726 * If other directory inodes depended on our current directory 6727 * inode's move/rename, now do their move/rename operations. 6728 */ 6729 if (!is_waiting_for_move(sctx, sctx->cur_ino)) { 6730 ret = apply_children_dir_moves(sctx); 6731 if (ret) 6732 goto out; 6733 /* 6734 * Need to send that every time, no matter if it actually 6735 * changed between the two trees as we have done changes to 6736 * the inode before. If our inode is a directory and it's 6737 * waiting to be moved/renamed, we will send its utimes when 6738 * it's moved/renamed, therefore we don't need to do it here. 6739 */ 6740 sctx->send_progress = sctx->cur_ino + 1; 6741 6742 /* 6743 * If the current inode is a non-empty directory, delay issuing 6744 * the utimes command for it, as it's very likely we have inodes 6745 * with an higher number inside it. We want to issue the utimes 6746 * command only after adding all dentries to it. 6747 */ 6748 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0) 6749 ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); 6750 else 6751 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); 6752 6753 if (ret < 0) 6754 goto out; 6755 } 6756 6757 out: 6758 if (!ret) 6759 ret = trim_dir_utimes_cache(sctx); 6760 6761 return ret; 6762 } 6763 6764 static void close_current_inode(struct send_ctx *sctx) 6765 { 6766 u64 i_size; 6767 6768 if (sctx->cur_inode == NULL) 6769 return; 6770 6771 i_size = i_size_read(sctx->cur_inode); 6772 6773 /* 6774 * If we are doing an incremental send, we may have extents between the 6775 * last processed extent and the i_size that have not been processed 6776 * because they haven't changed but we may have read some of their pages 6777 * through readahead, see the comments at send_extent_data(). 
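 * Drop those pages here, from page_cache_clear_start up to the rounded
 * up i_size, so that a send operation does not leave behind a large
 * amount of page cache it no longer needs for this inode.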
6778 */ 6779 if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) 6780 truncate_inode_pages_range(&sctx->cur_inode->i_data, 6781 sctx->page_cache_clear_start, 6782 round_up(i_size, PAGE_SIZE) - 1); 6783 6784 iput(sctx->cur_inode); 6785 sctx->cur_inode = NULL; 6786 } 6787 6788 static int changed_inode(struct send_ctx *sctx, 6789 enum btrfs_compare_tree_result result) 6790 { 6791 int ret = 0; 6792 struct btrfs_key *key = sctx->cmp_key; 6793 struct btrfs_inode_item *left_ii = NULL; 6794 struct btrfs_inode_item *right_ii = NULL; 6795 u64 left_gen = 0; 6796 u64 right_gen = 0; 6797 6798 close_current_inode(sctx); 6799 6800 sctx->cur_ino = key->objectid; 6801 sctx->cur_inode_new_gen = false; 6802 sctx->cur_inode_last_extent = (u64)-1; 6803 sctx->cur_inode_next_write_offset = 0; 6804 sctx->ignore_cur_inode = false; 6805 fs_path_reset(&sctx->cur_inode_path); 6806 6807 /* 6808 * Set send_progress to current inode. This will tell all get_cur_xxx 6809 * functions that the current inode's refs are not updated yet. Later, 6810 * when process_recorded_refs is finished, it is set to cur_ino + 1. 6811 */ 6812 sctx->send_progress = sctx->cur_ino; 6813 6814 if (result == BTRFS_COMPARE_TREE_NEW || 6815 result == BTRFS_COMPARE_TREE_CHANGED) { 6816 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0], 6817 sctx->left_path->slots[0], 6818 struct btrfs_inode_item); 6819 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0], 6820 left_ii); 6821 } else { 6822 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], 6823 sctx->right_path->slots[0], 6824 struct btrfs_inode_item); 6825 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 6826 right_ii); 6827 } 6828 if (result == BTRFS_COMPARE_TREE_CHANGED) { 6829 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0], 6830 sctx->right_path->slots[0], 6831 struct btrfs_inode_item); 6832 6833 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0], 6834 right_ii); 6835 6836 /* 6837 * The cur_ino = root dir case is special here. We can't treat 6838 * the inode as deleted+reused because it would generate a 6839 * stream that tries to delete/mkdir the root dir. 6840 */ 6841 if (left_gen != right_gen && 6842 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 6843 sctx->cur_inode_new_gen = true; 6844 } 6845 6846 /* 6847 * Normally we do not find inodes with a link count of zero (orphans) 6848 * because the most common case is to create a snapshot and use it 6849 * for a send operation. However other less common use cases involve 6850 * using a subvolume and send it after turning it to RO mode just 6851 * after deleting all hard links of a file while holding an open 6852 * file descriptor against it or turning a RO snapshot into RW mode, 6853 * keep an open file descriptor against a file, delete it and then 6854 * turn the snapshot back to RO mode before using it for a send 6855 * operation. The former is what the receiver operation does. 6856 * Therefore, if we want to send these snapshots soon after they're 6857 * received, we need to handle orphan inodes as well. Moreover, orphans 6858 * can appear not only in the send snapshot but also in the parent 6859 * snapshot. 
Here are several cases: 6860 * 6861 * Case 1: BTRFS_COMPARE_TREE_NEW 6862 * | send snapshot | action 6863 * -------------------------------- 6864 * nlink | 0 | ignore 6865 * 6866 * Case 2: BTRFS_COMPARE_TREE_DELETED 6867 * | parent snapshot | action 6868 * ---------------------------------- 6869 * nlink | 0 | as usual 6870 * Note: No unlinks will be sent because there're no paths for it. 6871 * 6872 * Case 3: BTRFS_COMPARE_TREE_CHANGED 6873 * | | parent snapshot | send snapshot | action 6874 * ----------------------------------------------------------------------- 6875 * subcase 1 | nlink | 0 | 0 | ignore 6876 * subcase 2 | nlink | >0 | 0 | new_gen(deletion) 6877 * subcase 3 | nlink | 0 | >0 | new_gen(creation) 6878 * 6879 */ 6880 if (result == BTRFS_COMPARE_TREE_NEW) { 6881 if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { 6882 sctx->ignore_cur_inode = true; 6883 goto out; 6884 } 6885 sctx->cur_inode_gen = left_gen; 6886 sctx->cur_inode_new = true; 6887 sctx->cur_inode_deleted = false; 6888 sctx->cur_inode_size = btrfs_inode_size( 6889 sctx->left_path->nodes[0], left_ii); 6890 sctx->cur_inode_mode = btrfs_inode_mode( 6891 sctx->left_path->nodes[0], left_ii); 6892 sctx->cur_inode_rdev = btrfs_inode_rdev( 6893 sctx->left_path->nodes[0], left_ii); 6894 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 6895 ret = send_create_inode_if_needed(sctx); 6896 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 6897 sctx->cur_inode_gen = right_gen; 6898 sctx->cur_inode_new = false; 6899 sctx->cur_inode_deleted = true; 6900 sctx->cur_inode_size = btrfs_inode_size( 6901 sctx->right_path->nodes[0], right_ii); 6902 sctx->cur_inode_mode = btrfs_inode_mode( 6903 sctx->right_path->nodes[0], right_ii); 6904 } else if (result == BTRFS_COMPARE_TREE_CHANGED) { 6905 u32 new_nlinks, old_nlinks; 6906 6907 new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii); 6908 old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); 6909 if (new_nlinks == 0 && old_nlinks == 0) { 6910 sctx->ignore_cur_inode = true; 6911 goto out; 6912 } else if (new_nlinks == 0 || old_nlinks == 0) { 6913 sctx->cur_inode_new_gen = 1; 6914 } 6915 /* 6916 * We need to do some special handling in case the inode was 6917 * reported as changed with a changed generation number. This 6918 * means that the original inode was deleted and new inode 6919 * reused the same inum. So we have to treat the old inode as 6920 * deleted and the new one as new. 6921 */ 6922 if (sctx->cur_inode_new_gen) { 6923 /* 6924 * First, process the inode as if it was deleted. 6925 */ 6926 if (old_nlinks > 0) { 6927 sctx->cur_inode_gen = right_gen; 6928 sctx->cur_inode_new = false; 6929 sctx->cur_inode_deleted = true; 6930 sctx->cur_inode_size = btrfs_inode_size( 6931 sctx->right_path->nodes[0], right_ii); 6932 sctx->cur_inode_mode = btrfs_inode_mode( 6933 sctx->right_path->nodes[0], right_ii); 6934 ret = process_all_refs(sctx, 6935 BTRFS_COMPARE_TREE_DELETED); 6936 if (ret < 0) 6937 goto out; 6938 } 6939 6940 /* 6941 * Now process the inode as if it was new. 
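 * If the new inode has a link count of zero, there is nothing to
 * create or send for it, so the creation step below is skipped.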
6942 */ 6943 if (new_nlinks > 0) { 6944 sctx->cur_inode_gen = left_gen; 6945 sctx->cur_inode_new = true; 6946 sctx->cur_inode_deleted = false; 6947 sctx->cur_inode_size = btrfs_inode_size( 6948 sctx->left_path->nodes[0], 6949 left_ii); 6950 sctx->cur_inode_mode = btrfs_inode_mode( 6951 sctx->left_path->nodes[0], 6952 left_ii); 6953 sctx->cur_inode_rdev = btrfs_inode_rdev( 6954 sctx->left_path->nodes[0], 6955 left_ii); 6956 ret = send_create_inode_if_needed(sctx); 6957 if (ret < 0) 6958 goto out; 6959 6960 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 6961 if (ret < 0) 6962 goto out; 6963 /* 6964 * Advance send_progress now as we did not get 6965 * into process_recorded_refs_if_needed in the 6966 * new_gen case. 6967 */ 6968 sctx->send_progress = sctx->cur_ino + 1; 6969 6970 /* 6971 * Now process all extents and xattrs of the 6972 * inode as if they were all new. 6973 */ 6974 ret = process_all_extents(sctx); 6975 if (ret < 0) 6976 goto out; 6977 ret = process_all_new_xattrs(sctx); 6978 if (ret < 0) 6979 goto out; 6980 } 6981 } else { 6982 sctx->cur_inode_gen = left_gen; 6983 sctx->cur_inode_new = false; 6984 sctx->cur_inode_new_gen = false; 6985 sctx->cur_inode_deleted = false; 6986 sctx->cur_inode_size = btrfs_inode_size( 6987 sctx->left_path->nodes[0], left_ii); 6988 sctx->cur_inode_mode = btrfs_inode_mode( 6989 sctx->left_path->nodes[0], left_ii); 6990 } 6991 } 6992 6993 out: 6994 return ret; 6995 } 6996 6997 /* 6998 * We have to process new refs before deleted refs, but compare_trees gives us 6999 * the new and deleted refs mixed. To fix this, we record the new/deleted refs 7000 * first and later process them in process_recorded_refs. 7001 * For the cur_inode_new_gen case, we skip recording completely because 7002 * changed_inode did already initiate processing of refs. The reason for this is 7003 * that in this case, compare_tree actually compares the refs of 2 different 7004 * inodes. To fix this, process_all_refs is used in changed_inode to handle all 7005 * refs of the right tree as deleted and all refs of the left tree as new. 7006 */ 7007 static int changed_ref(struct send_ctx *sctx, 7008 enum btrfs_compare_tree_result result) 7009 { 7010 int ret = 0; 7011 7012 if (sctx->cur_ino != sctx->cmp_key->objectid) { 7013 inconsistent_snapshot_error(sctx, result, "reference"); 7014 return -EIO; 7015 } 7016 7017 if (!sctx->cur_inode_new_gen && 7018 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { 7019 if (result == BTRFS_COMPARE_TREE_NEW) 7020 ret = record_new_ref(sctx); 7021 else if (result == BTRFS_COMPARE_TREE_DELETED) 7022 ret = record_deleted_ref(sctx); 7023 else if (result == BTRFS_COMPARE_TREE_CHANGED) 7024 ret = record_changed_ref(sctx); 7025 } 7026 7027 return ret; 7028 } 7029 7030 /* 7031 * Process new/deleted/changed xattrs. We skip processing in the 7032 * cur_inode_new_gen case because changed_inode did already initiate processing 7033 * of xattrs. 
The reason is the same as in changed_ref 7034 */ 7035 static int changed_xattr(struct send_ctx *sctx, 7036 enum btrfs_compare_tree_result result) 7037 { 7038 int ret = 0; 7039 7040 if (sctx->cur_ino != sctx->cmp_key->objectid) { 7041 inconsistent_snapshot_error(sctx, result, "xattr"); 7042 return -EIO; 7043 } 7044 7045 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 7046 if (result == BTRFS_COMPARE_TREE_NEW) 7047 ret = process_new_xattr(sctx); 7048 else if (result == BTRFS_COMPARE_TREE_DELETED) 7049 ret = process_deleted_xattr(sctx); 7050 else if (result == BTRFS_COMPARE_TREE_CHANGED) 7051 ret = process_changed_xattr(sctx); 7052 } 7053 7054 return ret; 7055 } 7056 7057 /* 7058 * Process new/deleted/changed extents. We skip processing in the 7059 * cur_inode_new_gen case because changed_inode did already initiate processing 7060 * of extents. The reason is the same as in changed_ref 7061 */ 7062 static int changed_extent(struct send_ctx *sctx, 7063 enum btrfs_compare_tree_result result) 7064 { 7065 int ret = 0; 7066 7067 /* 7068 * We have found an extent item that changed without the inode item 7069 * having changed. This can happen either after relocation (where the 7070 * disk_bytenr of an extent item is replaced at 7071 * relocation.c:replace_file_extents()) or after deduplication into a 7072 * file in both the parent and send snapshots (where an extent item can 7073 * get modified or replaced with a new one). Note that deduplication 7074 * updates the inode item, but it only changes the iversion (sequence 7075 * field in the inode item) of the inode, so if a file is deduplicated 7076 * the same amount of times in both the parent and send snapshots, its 7077 * iversion becomes the same in both snapshots, whence the inode item is 7078 * the same on both snapshots. 7079 */ 7080 if (sctx->cur_ino != sctx->cmp_key->objectid) 7081 return 0; 7082 7083 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 7084 if (result != BTRFS_COMPARE_TREE_DELETED) 7085 ret = process_extent(sctx, sctx->left_path, 7086 sctx->cmp_key); 7087 } 7088 7089 return ret; 7090 } 7091 7092 static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result) 7093 { 7094 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 7095 if (result == BTRFS_COMPARE_TREE_NEW) 7096 sctx->cur_inode_needs_verity = true; 7097 } 7098 return 0; 7099 } 7100 7101 static int dir_changed(struct send_ctx *sctx, u64 dir) 7102 { 7103 u64 orig_gen, new_gen; 7104 int ret; 7105 7106 ret = get_inode_gen(sctx->send_root, dir, &new_gen); 7107 if (ret) 7108 return ret; 7109 7110 ret = get_inode_gen(sctx->parent_root, dir, &orig_gen); 7111 if (ret) 7112 return ret; 7113 7114 return (orig_gen != new_gen) ? 
1 : 0; 7115 } 7116 7117 static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, 7118 struct btrfs_key *key) 7119 { 7120 struct btrfs_inode_extref *extref; 7121 struct extent_buffer *leaf; 7122 u64 dirid = 0, last_dirid = 0; 7123 unsigned long ptr; 7124 u32 item_size; 7125 u32 cur_offset = 0; 7126 int ref_name_len; 7127 int ret = 0; 7128 7129 /* Easy case, just check this one dirid */ 7130 if (key->type == BTRFS_INODE_REF_KEY) { 7131 dirid = key->offset; 7132 7133 ret = dir_changed(sctx, dirid); 7134 goto out; 7135 } 7136 7137 leaf = path->nodes[0]; 7138 item_size = btrfs_item_size(leaf, path->slots[0]); 7139 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 7140 while (cur_offset < item_size) { 7141 extref = (struct btrfs_inode_extref *)(ptr + 7142 cur_offset); 7143 dirid = btrfs_inode_extref_parent(leaf, extref); 7144 ref_name_len = btrfs_inode_extref_name_len(leaf, extref); 7145 cur_offset += ref_name_len + sizeof(*extref); 7146 if (dirid == last_dirid) 7147 continue; 7148 ret = dir_changed(sctx, dirid); 7149 if (ret) 7150 break; 7151 last_dirid = dirid; 7152 } 7153 out: 7154 return ret; 7155 } 7156 7157 /* 7158 * Updates compare related fields in sctx and simply forwards to the actual 7159 * changed_xxx functions. 7160 */ 7161 static int changed_cb(struct btrfs_path *left_path, 7162 struct btrfs_path *right_path, 7163 struct btrfs_key *key, 7164 enum btrfs_compare_tree_result result, 7165 struct send_ctx *sctx) 7166 { 7167 int ret; 7168 7169 /* 7170 * We can not hold the commit root semaphore here. This is because in 7171 * the case of sending and receiving to the same filesystem, using a 7172 * pipe, could result in a deadlock: 7173 * 7174 * 1) The task running send blocks on the pipe because it's full; 7175 * 7176 * 2) The task running receive, which is the only consumer of the pipe, 7177 * is waiting for a transaction commit (for example due to a space 7178 * reservation when doing a write or triggering a transaction commit 7179 * when creating a subvolume); 7180 * 7181 * 3) The transaction is waiting to write lock the commit root semaphore, 7182 * but can not acquire it since it's being held at 1). 7183 * 7184 * Down this call chain we write to the pipe through kernel_write(). 7185 * The same type of problem can also happen when sending to a file that 7186 * is stored in the same filesystem - when reserving space for a write 7187 * into the file, we can trigger a transaction commit. 7188 * 7189 * Our caller has supplied us with clones of leaves from the send and 7190 * parent roots, so we're safe here from a concurrent relocation and 7191 * further reallocation of metadata extents while we are here. Below we 7192 * also assert that the leaves are clones. 7193 */ 7194 lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); 7195 7196 /* 7197 * We always have a send root, so left_path is never NULL. We will not 7198 * have a leaf when we have reached the end of the send root but have 7199 * not yet reached the end of the parent root. 7200 */ 7201 if (left_path->nodes[0]) 7202 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, 7203 &left_path->nodes[0]->bflags)); 7204 /* 7205 * When doing a full send we don't have a parent root, so right_path is 7206 * NULL. When doing an incremental send, we may have reached the end of 7207 * the parent root already, so we don't have a leaf at right_path. 
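 *
 * Cloned extent buffers are never mapped to a disk location and have
 * the EXTENT_BUFFER_UNMAPPED bit set, which is what these assertions
 * check for.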
7208 */ 7209 if (right_path && right_path->nodes[0]) 7210 ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, 7211 &right_path->nodes[0]->bflags)); 7212 7213 if (result == BTRFS_COMPARE_TREE_SAME) { 7214 if (key->type == BTRFS_INODE_REF_KEY || 7215 key->type == BTRFS_INODE_EXTREF_KEY) { 7216 ret = compare_refs(sctx, left_path, key); 7217 if (!ret) 7218 return 0; 7219 if (ret < 0) 7220 return ret; 7221 } else if (key->type == BTRFS_EXTENT_DATA_KEY) { 7222 return maybe_send_hole(sctx, left_path, key); 7223 } else { 7224 return 0; 7225 } 7226 result = BTRFS_COMPARE_TREE_CHANGED; 7227 } 7228 7229 sctx->left_path = left_path; 7230 sctx->right_path = right_path; 7231 sctx->cmp_key = key; 7232 7233 ret = finish_inode_if_needed(sctx, 0); 7234 if (ret < 0) 7235 goto out; 7236 7237 /* Ignore non-FS objects */ 7238 if (key->objectid == BTRFS_FREE_INO_OBJECTID || 7239 key->objectid == BTRFS_FREE_SPACE_OBJECTID) 7240 goto out; 7241 7242 if (key->type == BTRFS_INODE_ITEM_KEY) { 7243 ret = changed_inode(sctx, result); 7244 } else if (!sctx->ignore_cur_inode) { 7245 if (key->type == BTRFS_INODE_REF_KEY || 7246 key->type == BTRFS_INODE_EXTREF_KEY) 7247 ret = changed_ref(sctx, result); 7248 else if (key->type == BTRFS_XATTR_ITEM_KEY) 7249 ret = changed_xattr(sctx, result); 7250 else if (key->type == BTRFS_EXTENT_DATA_KEY) 7251 ret = changed_extent(sctx, result); 7252 else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY && 7253 key->offset == 0) 7254 ret = changed_verity(sctx, result); 7255 } 7256 7257 out: 7258 return ret; 7259 } 7260 7261 static int search_key_again(const struct send_ctx *sctx, 7262 struct btrfs_root *root, 7263 struct btrfs_path *path, 7264 const struct btrfs_key *key) 7265 { 7266 int ret; 7267 7268 if (!path->need_commit_sem) 7269 lockdep_assert_held_read(&root->fs_info->commit_root_sem); 7270 7271 /* 7272 * Roots used for send operations are readonly and no one can add, 7273 * update or remove keys from them, so we should be able to find our 7274 * key again. The only exception is deduplication, which can operate on 7275 * readonly roots and add, update or remove keys to/from them - but at 7276 * the moment we don't allow it to run in parallel with send. 7277 */ 7278 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 7279 ASSERT(ret <= 0); 7280 if (ret > 0) { 7281 btrfs_print_tree(path->nodes[path->lowest_level], false); 7282 btrfs_err(root->fs_info, 7283 "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", 7284 key->objectid, key->type, key->offset, 7285 (root == sctx->parent_root ? 
"parent" : "send"), 7286 btrfs_root_id(root), path->lowest_level, 7287 path->slots[path->lowest_level]); 7288 return -EUCLEAN; 7289 } 7290 7291 return ret; 7292 } 7293 7294 static int full_send_tree(struct send_ctx *sctx) 7295 { 7296 int ret; 7297 struct btrfs_root *send_root = sctx->send_root; 7298 struct btrfs_key key; 7299 struct btrfs_fs_info *fs_info = send_root->fs_info; 7300 struct btrfs_path *path; 7301 7302 path = alloc_path_for_send(); 7303 if (!path) 7304 return -ENOMEM; 7305 path->reada = READA_FORWARD_ALWAYS; 7306 7307 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 7308 key.type = BTRFS_INODE_ITEM_KEY; 7309 key.offset = 0; 7310 7311 down_read(&fs_info->commit_root_sem); 7312 sctx->last_reloc_trans = fs_info->last_reloc_trans; 7313 up_read(&fs_info->commit_root_sem); 7314 7315 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); 7316 if (ret < 0) 7317 goto out; 7318 if (ret) 7319 goto out_finish; 7320 7321 while (1) { 7322 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 7323 7324 ret = changed_cb(path, NULL, &key, 7325 BTRFS_COMPARE_TREE_NEW, sctx); 7326 if (ret < 0) 7327 goto out; 7328 7329 down_read(&fs_info->commit_root_sem); 7330 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { 7331 sctx->last_reloc_trans = fs_info->last_reloc_trans; 7332 up_read(&fs_info->commit_root_sem); 7333 /* 7334 * A transaction used for relocating a block group was 7335 * committed or is about to finish its commit. Release 7336 * our path (leaf) and restart the search, so that we 7337 * avoid operating on any file extent items that are 7338 * stale, with a disk_bytenr that reflects a pre 7339 * relocation value. This way we avoid as much as 7340 * possible to fallback to regular writes when checking 7341 * if we can clone file ranges. 7342 */ 7343 btrfs_release_path(path); 7344 ret = search_key_again(sctx, send_root, path, &key); 7345 if (ret < 0) 7346 goto out; 7347 } else { 7348 up_read(&fs_info->commit_root_sem); 7349 } 7350 7351 ret = btrfs_next_item(send_root, path); 7352 if (ret < 0) 7353 goto out; 7354 if (ret) { 7355 ret = 0; 7356 break; 7357 } 7358 } 7359 7360 out_finish: 7361 ret = finish_inode_if_needed(sctx, 1); 7362 7363 out: 7364 btrfs_free_path(path); 7365 return ret; 7366 } 7367 7368 static int replace_node_with_clone(struct btrfs_path *path, int level) 7369 { 7370 struct extent_buffer *clone; 7371 7372 clone = btrfs_clone_extent_buffer(path->nodes[level]); 7373 if (!clone) 7374 return -ENOMEM; 7375 7376 free_extent_buffer(path->nodes[level]); 7377 path->nodes[level] = clone; 7378 7379 return 0; 7380 } 7381 7382 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) 7383 { 7384 struct extent_buffer *eb; 7385 struct extent_buffer *parent = path->nodes[*level]; 7386 int slot = path->slots[*level]; 7387 const int nritems = btrfs_header_nritems(parent); 7388 u64 reada_max; 7389 u64 reada_done = 0; 7390 7391 lockdep_assert_held_read(&parent->fs_info->commit_root_sem); 7392 ASSERT(*level != 0); 7393 7394 eb = btrfs_read_node_slot(parent, slot); 7395 if (IS_ERR(eb)) 7396 return PTR_ERR(eb); 7397 7398 /* 7399 * Trigger readahead for the next leaves we will process, so that it is 7400 * very likely that when we need them they are already in memory and we 7401 * will not block on disk IO. For nodes we only do readahead for one, 7402 * since the time window between processing nodes is typically larger. 7403 */ 7404 reada_max = (*level == 1 ? 
SZ_128K : eb->fs_info->nodesize); 7405 7406 for (slot++; slot < nritems && reada_done < reada_max; slot++) { 7407 if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { 7408 btrfs_readahead_node_child(parent, slot); 7409 reada_done += eb->fs_info->nodesize; 7410 } 7411 } 7412 7413 path->nodes[*level - 1] = eb; 7414 path->slots[*level - 1] = 0; 7415 (*level)--; 7416 7417 if (*level == 0) 7418 return replace_node_with_clone(path, 0); 7419 7420 return 0; 7421 } 7422 7423 static int tree_move_next_or_upnext(struct btrfs_path *path, 7424 int *level, int root_level) 7425 { 7426 int ret = 0; 7427 int nritems; 7428 nritems = btrfs_header_nritems(path->nodes[*level]); 7429 7430 path->slots[*level]++; 7431 7432 while (path->slots[*level] >= nritems) { 7433 if (*level == root_level) { 7434 path->slots[*level] = nritems - 1; 7435 return -1; 7436 } 7437 7438 /* move upnext */ 7439 path->slots[*level] = 0; 7440 free_extent_buffer(path->nodes[*level]); 7441 path->nodes[*level] = NULL; 7442 (*level)++; 7443 path->slots[*level]++; 7444 7445 nritems = btrfs_header_nritems(path->nodes[*level]); 7446 ret = 1; 7447 } 7448 return ret; 7449 } 7450 7451 /* 7452 * Returns 1 if it had to move up and next. 0 is returned if it moved only next 7453 * or down. 7454 */ 7455 static int tree_advance(struct btrfs_path *path, 7456 int *level, int root_level, 7457 int allow_down, 7458 struct btrfs_key *key, 7459 u64 reada_min_gen) 7460 { 7461 int ret; 7462 7463 if (*level == 0 || !allow_down) { 7464 ret = tree_move_next_or_upnext(path, level, root_level); 7465 } else { 7466 ret = tree_move_down(path, level, reada_min_gen); 7467 } 7468 7469 /* 7470 * Even if we have reached the end of a tree, ret is -1, update the key 7471 * anyway, so that in case we need to restart due to a block group 7472 * relocation, we can assert that the last key of the root node still 7473 * exists in the tree. 7474 */ 7475 if (*level == 0) 7476 btrfs_item_key_to_cpu(path->nodes[*level], key, 7477 path->slots[*level]); 7478 else 7479 btrfs_node_key_to_cpu(path->nodes[*level], key, 7480 path->slots[*level]); 7481 7482 return ret; 7483 } 7484 7485 static int tree_compare_item(struct btrfs_path *left_path, 7486 struct btrfs_path *right_path, 7487 char *tmp_buf) 7488 { 7489 int cmp; 7490 int len1, len2; 7491 unsigned long off1, off2; 7492 7493 len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]); 7494 len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]); 7495 if (len1 != len2) 7496 return 1; 7497 7498 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]); 7499 off2 = btrfs_item_ptr_offset(right_path->nodes[0], 7500 right_path->slots[0]); 7501 7502 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1); 7503 7504 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1); 7505 if (cmp) 7506 return 1; 7507 return 0; 7508 } 7509 7510 /* 7511 * A transaction used for relocating a block group was committed or is about to 7512 * finish its commit. Release our paths and restart the search, so that we are 7513 * not using stale extent buffers: 7514 * 7515 * 1) For levels > 0, we are only holding references of extent buffers, without 7516 * any locks on them, which does not prevent them from having been relocated 7517 * and reallocated after the last time we released the commit root semaphore. 
7518 * The exception are the root nodes, for which we always have a clone, see 7519 * the comment at btrfs_compare_trees(); 7520 * 7521 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so 7522 * we are safe from the concurrent relocation and reallocation. However they 7523 * can have file extent items with a pre relocation disk_bytenr value, so we 7524 * restart the start from the current commit roots and clone the new leaves so 7525 * that we get the post relocation disk_bytenr values. Not doing so, could 7526 * make us clone the wrong data in case there are new extents using the old 7527 * disk_bytenr that happen to be shared. 7528 */ 7529 static int restart_after_relocation(struct btrfs_path *left_path, 7530 struct btrfs_path *right_path, 7531 const struct btrfs_key *left_key, 7532 const struct btrfs_key *right_key, 7533 int left_level, 7534 int right_level, 7535 const struct send_ctx *sctx) 7536 { 7537 int root_level; 7538 int ret; 7539 7540 lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); 7541 7542 btrfs_release_path(left_path); 7543 btrfs_release_path(right_path); 7544 7545 /* 7546 * Since keys can not be added or removed to/from our roots because they 7547 * are readonly and we do not allow deduplication to run in parallel 7548 * (which can add, remove or change keys), the layout of the trees should 7549 * not change. 7550 */ 7551 left_path->lowest_level = left_level; 7552 ret = search_key_again(sctx, sctx->send_root, left_path, left_key); 7553 if (ret < 0) 7554 return ret; 7555 7556 right_path->lowest_level = right_level; 7557 ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); 7558 if (ret < 0) 7559 return ret; 7560 7561 /* 7562 * If the lowest level nodes are leaves, clone them so that they can be 7563 * safely used by changed_cb() while not under the protection of the 7564 * commit root semaphore, even if relocation and reallocation happens in 7565 * parallel. 7566 */ 7567 if (left_level == 0) { 7568 ret = replace_node_with_clone(left_path, 0); 7569 if (ret < 0) 7570 return ret; 7571 } 7572 7573 if (right_level == 0) { 7574 ret = replace_node_with_clone(right_path, 0); 7575 if (ret < 0) 7576 return ret; 7577 } 7578 7579 /* 7580 * Now clone the root nodes (unless they happen to be the leaves we have 7581 * already cloned). This is to protect against concurrent snapshotting of 7582 * the send and parent roots (see the comment at btrfs_compare_trees()). 7583 */ 7584 root_level = btrfs_header_level(sctx->send_root->commit_root); 7585 if (root_level > 0) { 7586 ret = replace_node_with_clone(left_path, root_level); 7587 if (ret < 0) 7588 return ret; 7589 } 7590 7591 root_level = btrfs_header_level(sctx->parent_root->commit_root); 7592 if (root_level > 0) { 7593 ret = replace_node_with_clone(right_path, root_level); 7594 if (ret < 0) 7595 return ret; 7596 } 7597 7598 return 0; 7599 } 7600 7601 /* 7602 * This function compares two trees and calls the provided callback for 7603 * every changed/new/deleted item it finds. 7604 * If shared tree blocks are encountered, whole subtrees are skipped, making 7605 * the compare pretty fast on snapshotted subvolumes. 7606 * 7607 * This currently works on commit roots only. As commit roots are read only, 7608 * we don't do any locking. The commit roots are protected with transactions. 7609 * Transactions are ended and rejoined when a commit is tried in between. 7610 * 7611 * This function checks for modifications done to the trees while comparing. 
7612 * If it detects a change, it aborts immediately. 7613 */ 7614 static int btrfs_compare_trees(struct btrfs_root *left_root, 7615 struct btrfs_root *right_root, struct send_ctx *sctx) 7616 { 7617 struct btrfs_fs_info *fs_info = left_root->fs_info; 7618 int ret; 7619 int cmp; 7620 struct btrfs_path *left_path = NULL; 7621 struct btrfs_path *right_path = NULL; 7622 struct btrfs_key left_key; 7623 struct btrfs_key right_key; 7624 char *tmp_buf = NULL; 7625 int left_root_level; 7626 int right_root_level; 7627 int left_level; 7628 int right_level; 7629 int left_end_reached = 0; 7630 int right_end_reached = 0; 7631 int advance_left = 0; 7632 int advance_right = 0; 7633 u64 left_blockptr; 7634 u64 right_blockptr; 7635 u64 left_gen; 7636 u64 right_gen; 7637 u64 reada_min_gen; 7638 7639 left_path = btrfs_alloc_path(); 7640 if (!left_path) { 7641 ret = -ENOMEM; 7642 goto out; 7643 } 7644 right_path = btrfs_alloc_path(); 7645 if (!right_path) { 7646 ret = -ENOMEM; 7647 goto out; 7648 } 7649 7650 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); 7651 if (!tmp_buf) { 7652 ret = -ENOMEM; 7653 goto out; 7654 } 7655 7656 left_path->search_commit_root = 1; 7657 left_path->skip_locking = 1; 7658 right_path->search_commit_root = 1; 7659 right_path->skip_locking = 1; 7660 7661 /* 7662 * Strategy: Go to the first items of both trees. Then do 7663 * 7664 * If both trees are at level 0 7665 * Compare keys of current items 7666 * If left < right treat left item as new, advance left tree 7667 * and repeat 7668 * If left > right treat right item as deleted, advance right tree 7669 * and repeat 7670 * If left == right do deep compare of items, treat as changed if 7671 * needed, advance both trees and repeat 7672 * If both trees are at the same level but not at level 0 7673 * Compare keys of current nodes/leafs 7674 * If left < right advance left tree and repeat 7675 * If left > right advance right tree and repeat 7676 * If left == right compare blockptrs of the next nodes/leafs 7677 * If they match advance both trees but stay at the same level 7678 * and repeat 7679 * If they don't match advance both trees while allowing to go 7680 * deeper and repeat 7681 * If tree levels are different 7682 * Advance the tree that needs it and repeat 7683 * 7684 * Advancing a tree means: 7685 * If we are at level 0, try to go to the next slot. If that's not 7686 * possible, go one level up and repeat. Stop when we found a level 7687 * where we could go to the next slot. We may at this point be on a 7688 * node or a leaf. 7689 * 7690 * If we are not at level 0 and not on shared tree blocks, go one 7691 * level deeper. 7692 * 7693 * If we are not at level 0 and on shared tree blocks, go one slot to 7694 * the right if possible or go up and right. 7695 */ 7696 7697 down_read(&fs_info->commit_root_sem); 7698 left_level = btrfs_header_level(left_root->commit_root); 7699 left_root_level = left_level; 7700 /* 7701 * We clone the root node of the send and parent roots to prevent races 7702 * with snapshot creation of these roots. Snapshot creation COWs the 7703 * root node of a tree, so after the transaction is committed the old 7704 * extent can be reallocated while this send operation is still ongoing. 7705 * So we clone them, under the commit root semaphore, to be race free. 
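 *
 * Lower levels are protected differently: leaves are always cloned when
 * we descend to them (see tree_move_down() and replace_node_with_clone()),
 * and if a transaction used for relocating a block group committed in
 * the meanwhile, the search is restarted from the commit roots (see
 * restart_after_relocation()).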
7706 */ 7707 left_path->nodes[left_level] = 7708 btrfs_clone_extent_buffer(left_root->commit_root); 7709 if (!left_path->nodes[left_level]) { 7710 ret = -ENOMEM; 7711 goto out_unlock; 7712 } 7713 7714 right_level = btrfs_header_level(right_root->commit_root); 7715 right_root_level = right_level; 7716 right_path->nodes[right_level] = 7717 btrfs_clone_extent_buffer(right_root->commit_root); 7718 if (!right_path->nodes[right_level]) { 7719 ret = -ENOMEM; 7720 goto out_unlock; 7721 } 7722 /* 7723 * Our right root is the parent root, while the left root is the "send" 7724 * root. We know that all new nodes/leaves in the left root must have 7725 * a generation greater than the right root's generation, so we trigger 7726 * readahead for those nodes and leaves of the left root, as we know we 7727 * will need to read them at some point. 7728 */ 7729 reada_min_gen = btrfs_header_generation(right_root->commit_root); 7730 7731 if (left_level == 0) 7732 btrfs_item_key_to_cpu(left_path->nodes[left_level], 7733 &left_key, left_path->slots[left_level]); 7734 else 7735 btrfs_node_key_to_cpu(left_path->nodes[left_level], 7736 &left_key, left_path->slots[left_level]); 7737 if (right_level == 0) 7738 btrfs_item_key_to_cpu(right_path->nodes[right_level], 7739 &right_key, right_path->slots[right_level]); 7740 else 7741 btrfs_node_key_to_cpu(right_path->nodes[right_level], 7742 &right_key, right_path->slots[right_level]); 7743 7744 sctx->last_reloc_trans = fs_info->last_reloc_trans; 7745 7746 while (1) { 7747 if (need_resched() || 7748 rwsem_is_contended(&fs_info->commit_root_sem)) { 7749 up_read(&fs_info->commit_root_sem); 7750 cond_resched(); 7751 down_read(&fs_info->commit_root_sem); 7752 } 7753 7754 if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { 7755 ret = restart_after_relocation(left_path, right_path, 7756 &left_key, &right_key, 7757 left_level, right_level, 7758 sctx); 7759 if (ret < 0) 7760 goto out_unlock; 7761 sctx->last_reloc_trans = fs_info->last_reloc_trans; 7762 } 7763 7764 if (advance_left && !left_end_reached) { 7765 ret = tree_advance(left_path, &left_level, 7766 left_root_level, 7767 advance_left != ADVANCE_ONLY_NEXT, 7768 &left_key, reada_min_gen); 7769 if (ret == -1) 7770 left_end_reached = ADVANCE; 7771 else if (ret < 0) 7772 goto out_unlock; 7773 advance_left = 0; 7774 } 7775 if (advance_right && !right_end_reached) { 7776 ret = tree_advance(right_path, &right_level, 7777 right_root_level, 7778 advance_right != ADVANCE_ONLY_NEXT, 7779 &right_key, reada_min_gen); 7780 if (ret == -1) 7781 right_end_reached = ADVANCE; 7782 else if (ret < 0) 7783 goto out_unlock; 7784 advance_right = 0; 7785 } 7786 7787 if (left_end_reached && right_end_reached) { 7788 ret = 0; 7789 goto out_unlock; 7790 } else if (left_end_reached) { 7791 if (right_level == 0) { 7792 up_read(&fs_info->commit_root_sem); 7793 ret = changed_cb(left_path, right_path, 7794 &right_key, 7795 BTRFS_COMPARE_TREE_DELETED, 7796 sctx); 7797 if (ret < 0) 7798 goto out; 7799 down_read(&fs_info->commit_root_sem); 7800 } 7801 advance_right = ADVANCE; 7802 continue; 7803 } else if (right_end_reached) { 7804 if (left_level == 0) { 7805 up_read(&fs_info->commit_root_sem); 7806 ret = changed_cb(left_path, right_path, 7807 &left_key, 7808 BTRFS_COMPARE_TREE_NEW, 7809 sctx); 7810 if (ret < 0) 7811 goto out; 7812 down_read(&fs_info->commit_root_sem); 7813 } 7814 advance_left = ADVANCE; 7815 continue; 7816 } 7817 7818 if (left_level == 0 && right_level == 0) { 7819 up_read(&fs_info->commit_root_sem); 7820 cmp = 
btrfs_comp_cpu_keys(&left_key, &right_key); 7821 if (cmp < 0) { 7822 ret = changed_cb(left_path, right_path, 7823 &left_key, 7824 BTRFS_COMPARE_TREE_NEW, 7825 sctx); 7826 advance_left = ADVANCE; 7827 } else if (cmp > 0) { 7828 ret = changed_cb(left_path, right_path, 7829 &right_key, 7830 BTRFS_COMPARE_TREE_DELETED, 7831 sctx); 7832 advance_right = ADVANCE; 7833 } else { 7834 enum btrfs_compare_tree_result result; 7835 7836 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 7837 ret = tree_compare_item(left_path, right_path, 7838 tmp_buf); 7839 if (ret) 7840 result = BTRFS_COMPARE_TREE_CHANGED; 7841 else 7842 result = BTRFS_COMPARE_TREE_SAME; 7843 ret = changed_cb(left_path, right_path, 7844 &left_key, result, sctx); 7845 advance_left = ADVANCE; 7846 advance_right = ADVANCE; 7847 } 7848 7849 if (ret < 0) 7850 goto out; 7851 down_read(&fs_info->commit_root_sem); 7852 } else if (left_level == right_level) { 7853 cmp = btrfs_comp_cpu_keys(&left_key, &right_key); 7854 if (cmp < 0) { 7855 advance_left = ADVANCE; 7856 } else if (cmp > 0) { 7857 advance_right = ADVANCE; 7858 } else { 7859 left_blockptr = btrfs_node_blockptr( 7860 left_path->nodes[left_level], 7861 left_path->slots[left_level]); 7862 right_blockptr = btrfs_node_blockptr( 7863 right_path->nodes[right_level], 7864 right_path->slots[right_level]); 7865 left_gen = btrfs_node_ptr_generation( 7866 left_path->nodes[left_level], 7867 left_path->slots[left_level]); 7868 right_gen = btrfs_node_ptr_generation( 7869 right_path->nodes[right_level], 7870 right_path->slots[right_level]); 7871 if (left_blockptr == right_blockptr && 7872 left_gen == right_gen) { 7873 /* 7874 * As we're on a shared block, don't 7875 * allow to go deeper. 7876 */ 7877 advance_left = ADVANCE_ONLY_NEXT; 7878 advance_right = ADVANCE_ONLY_NEXT; 7879 } else { 7880 advance_left = ADVANCE; 7881 advance_right = ADVANCE; 7882 } 7883 } 7884 } else if (left_level < right_level) { 7885 advance_right = ADVANCE; 7886 } else { 7887 advance_left = ADVANCE; 7888 } 7889 } 7890 7891 out_unlock: 7892 up_read(&fs_info->commit_root_sem); 7893 out: 7894 btrfs_free_path(left_path); 7895 btrfs_free_path(right_path); 7896 kvfree(tmp_buf); 7897 return ret; 7898 } 7899 7900 static int send_subvol(struct send_ctx *sctx) 7901 { 7902 int ret; 7903 7904 if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { 7905 ret = send_header(sctx); 7906 if (ret < 0) 7907 goto out; 7908 } 7909 7910 ret = send_subvol_begin(sctx); 7911 if (ret < 0) 7912 goto out; 7913 7914 if (sctx->parent_root) { 7915 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); 7916 if (ret < 0) 7917 goto out; 7918 ret = finish_inode_if_needed(sctx, 1); 7919 if (ret < 0) 7920 goto out; 7921 } else { 7922 ret = full_send_tree(sctx); 7923 if (ret < 0) 7924 goto out; 7925 } 7926 7927 out: 7928 free_recorded_refs(sctx); 7929 return ret; 7930 } 7931 7932 /* 7933 * If orphan cleanup did remove any orphans from a root, it means the tree 7934 * was modified and therefore the commit root is not the same as the current 7935 * root anymore. This is a problem, because send uses the commit root and 7936 * therefore can see inode items that don't exist in the current root anymore, 7937 * and for example make calls to btrfs_iget, which will do tree lookups based 7938 * on the current root and not on the commit root. Those lookups will fail, 7939 * returning a -ESTALE error, and making send fail with that error. 
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}

/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error. So make
 * sure a send does not see any orphans we have just removed, and that it will
 * see the same inodes regardless of whether a transaction commit happened
 * before it started (meaning that the commit root will be the same as the
 * current root) or not.
 */
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;

	if (root && root->node != root->commit_root)
		return btrfs_commit_current_transaction(root);

	for (int i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		if (root->node != root->commit_root)
			return btrfs_commit_current_transaction(root);
	}

	return 0;
}

/*
 * Make sure any existing delalloc is flushed for any root used by a send
 * operation so that we do not miss any data and we do not race with writeback
 * finishing and changing a tree while send is using the tree. This could
 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
 * a send operation then uses the subvolume.
 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
 */
static int flush_delalloc_roots(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;
	int ret;
	int i;

	if (root) {
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	return 0;
}

static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
{
	spin_lock(&root->root_item_lock);
	root->send_in_progress--;
	/*
	 * Not much left to do, we don't know why it's unbalanced and
	 * can't blindly reset it to 0.
	 */
	if (root->send_in_progress < 0)
		btrfs_err(root->fs_info,
			  "send_in_progress unbalanced %d root %llu",
			  root->send_in_progress, btrfs_root_id(root));
	spin_unlock(&root->root_item_lock);
}

static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
	btrfs_warn_rl(root->fs_info,
		      "cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
		      btrfs_root_id(root), root->dedupe_in_progress);
}

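/*
 * Entry point for the send ioctl. The send root must stay read-only for the
 * whole operation, and neither it, the optional parent root nor any clone
 * source may be dead or have deduplication in progress. A rough sketch of the
 * userspace side, illustrative only and assuming the uapi definitions from
 * include/uapi/linux/btrfs.h:
 *
 *	struct btrfs_ioctl_send_args args = { 0 };
 *
 *	args.send_fd = out_fd;         (writable fd that receives the stream)
 *	args.parent_root = parent_id;  (0 for a full, non-incremental send)
 *	args.flags = BTRFS_SEND_FLAG_VERSION;
 *	args.version = 0;              (0 selects the highest supported version)
 *	ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
 */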
long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg)
{
	int ret = 0;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_root *clone_root;
	struct send_ctx *sctx = NULL;
	u32 i;
	u64 *clone_sources_tmp = NULL;
	int clone_sources_to_rollback = 0;
	size_t alloc_size;
	int sort_clone_roots = 0;
	struct btrfs_lru_cache_entry *entry;
	struct btrfs_lru_cache_entry *tmp;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * The subvolume must remain read-only during send, protect against
	 * making it RW. This also protects against deletion.
	 */
	spin_lock(&send_root->root_item_lock);
	/*
	 * Unlikely but possible, if the subvolume is marked for deletion but
	 * is slow to remove the directory entry, send can still be started.
	 */
	if (btrfs_root_dead(send_root)) {
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	/* Userspace tools do the checks and warn the user if it's not RO. */
	if (!btrfs_root_readonly(send_root)) {
		spin_unlock(&send_root->root_item_lock);
		return -EPERM;
	}
	if (send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}
	send_root->send_in_progress++;
	spin_unlock(&send_root->root_item_lock);

	/*
	 * Check that we don't overflow at later allocations, we request
	 * clone_sources_count + 1 items, and compare to unsigned long inside
	 * access_ok. Also set an upper limit for allocation size so this can't
	 * easily exhaust memory. Max number of clone sources is about 200K.
	 */
	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
		ret = -EINVAL;
		goto out;
	}

	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
	if (!sctx) {
		ret = -ENOMEM;
		goto out;
	}

	init_path(&sctx->cur_inode_path);
	INIT_LIST_HEAD(&sctx->new_refs);
	INIT_LIST_HEAD(&sctx->deleted_refs);

	btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->dir_created_cache,
			     SEND_MAX_DIR_CREATED_CACHE_SIZE);
	/*
	 * This cache is periodically trimmed to a fixed size elsewhere, see
	 * cache_dir_utimes() and trim_dir_utimes_cache().
	 */
	btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);

	sctx->pending_dir_moves = RB_ROOT;
	sctx->waiting_dir_moves = RB_ROOT;
	sctx->orphan_dirs = RB_ROOT;
	sctx->rbtree_new_refs = RB_ROOT;
	sctx->rbtree_deleted_refs = RB_ROOT;

	sctx->flags = arg->flags;

	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
			ret = -EPROTO;
			goto out;
		}
		/* Zero means "use the highest version" */
		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
	} else {
		sctx->proto = 1;
	}
	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
		ret = -EINVAL;
		goto out;
	}

	sctx->send_filp = fget(arg->send_fd);
	if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	sctx->send_root = send_root;
	sctx->clone_roots_cnt = arg->clone_sources_count;

	if (sctx->proto >= 2) {
		u32 send_buf_num_pages;

		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
		sctx->send_buf = vmalloc(sctx->send_max_size);
		if (!sctx->send_buf) {
			ret = -ENOMEM;
			goto out;
		}
		send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
		sctx->send_buf_pages = kcalloc(send_buf_num_pages,
					       sizeof(*sctx->send_buf_pages),
					       GFP_KERNEL);
		if (!sctx->send_buf_pages) {
			ret = -ENOMEM;
			goto out;
		}
		for (i = 0; i < send_buf_num_pages; i++) {
			sctx->send_buf_pages[i] =
				vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
		}
	} else {
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
		sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
	}
	if (!sctx->send_buf) {
		ret = -ENOMEM;
		goto out;
	}

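	/*
	 * Allocate one extra slot: the send root itself is appended as the
	 * last clone root further below, so that clone sources within the
	 * subvolume being sent can be used as well.
	 */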
	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
				     sizeof(*sctx->clone_roots),
				     GFP_KERNEL);
	if (!sctx->clone_roots) {
		ret = -ENOMEM;
		goto out;
	}

	alloc_size = array_size(sizeof(*arg->clone_sources),
				arg->clone_sources_count);

	if (arg->clone_sources_count) {
		clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
		if (!clone_sources_tmp) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
				     alloc_size);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		for (i = 0; i < arg->clone_sources_count; i++) {
			clone_root = btrfs_get_fs_root(fs_info,
						       clone_sources_tmp[i], true);
			if (IS_ERR(clone_root)) {
				ret = PTR_ERR(clone_root);
				goto out;
			}
			spin_lock(&clone_root->root_item_lock);
			if (!btrfs_root_readonly(clone_root) ||
			    btrfs_root_dead(clone_root)) {
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EPERM;
				goto out;
			}
			if (clone_root->dedupe_in_progress) {
				dedupe_in_progress_warn(clone_root);
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EAGAIN;
				goto out;
			}
			clone_root->send_in_progress++;
			spin_unlock(&clone_root->root_item_lock);

			sctx->clone_roots[i].root = clone_root;
			clone_sources_to_rollback = i + 1;
		}
		kvfree(clone_sources_tmp);
		clone_sources_tmp = NULL;
	}

	if (arg->parent_root) {
		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
						      true);
		if (IS_ERR(sctx->parent_root)) {
			ret = PTR_ERR(sctx->parent_root);
			goto out;
		}

		spin_lock(&sctx->parent_root->root_item_lock);
		sctx->parent_root->send_in_progress++;
		if (!btrfs_root_readonly(sctx->parent_root) ||
		    btrfs_root_dead(sctx->parent_root)) {
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EPERM;
			goto out;
		}
		if (sctx->parent_root->dedupe_in_progress) {
			dedupe_in_progress_warn(sctx->parent_root);
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EAGAIN;
			goto out;
		}
		spin_unlock(&sctx->parent_root->root_item_lock);
	}

	/*
	 * Clones from send_root are allowed, but only if the clone source
	 * is behind the current send position. This is checked while searching
	 * for possible clone sources.
	 */
	sctx->clone_roots[sctx->clone_roots_cnt++].root =
		btrfs_grab_root(sctx->send_root);

	/* We do a bsearch later */
	sort(sctx->clone_roots, sctx->clone_roots_cnt,
	     sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
	     NULL);
	sort_clone_roots = 1;

	ret = flush_delalloc_roots(sctx);
	if (ret)
		goto out;

	ret = ensure_commit_roots_uptodate(sctx);
	if (ret)
		goto out;

	ret = send_subvol(sctx);
	if (ret < 0)
		goto out;

	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			goto out;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
		if (ret < 0)
			goto out;
		ret = send_cmd(sctx);
		if (ret < 0)
			goto out;
	}

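	/*
	 * Success or failure, tear everything down from here: free any
	 * leftover pending/waiting directory moves and orphan dir info (all
	 * of which must be empty on success, hence the WARN_ONs), drop the
	 * send_in_progress counts taken above and release the send context
	 * with its buffers and caches.
	 */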
out:
	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
		struct rb_node *n;
		struct pending_dir_move *pm;

		n = rb_first(&sctx->pending_dir_moves);
		pm = rb_entry(n, struct pending_dir_move, node);
		while (!list_empty(&pm->list)) {
			struct pending_dir_move *pm2;

			pm2 = list_first_entry(&pm->list,
					       struct pending_dir_move, list);
			free_pending_move(sctx, pm2);
		}
		free_pending_move(sctx, pm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
		struct rb_node *n;
		struct waiting_dir_move *dm;

		n = rb_first(&sctx->waiting_dir_moves);
		dm = rb_entry(n, struct waiting_dir_move, node);
		rb_erase(&dm->node, &sctx->waiting_dir_moves);
		kfree(dm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n;
		struct orphan_dir_info *odi;

		n = rb_first(&sctx->orphan_dirs);
		odi = rb_entry(n, struct orphan_dir_info, node);
		free_orphan_dir_info(sctx, odi);
	}

	if (sort_clone_roots) {
		for (i = 0; i < sctx->clone_roots_cnt; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}
	} else {
		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}

		btrfs_root_dec_send_in_progress(send_root);
	}
	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
		btrfs_root_dec_send_in_progress(sctx->parent_root);
		btrfs_put_root(sctx->parent_root);
	}

	kvfree(clone_sources_tmp);

	if (sctx) {
		if (sctx->send_filp)
			fput(sctx->send_filp);

		kvfree(sctx->clone_roots);
		kfree(sctx->send_buf_pages);
		kvfree(sctx->send_buf);
		kvfree(sctx->verity_descriptor);

		close_current_inode(sctx);

		btrfs_lru_cache_clear(&sctx->name_cache);
		btrfs_lru_cache_clear(&sctx->backref_cache);
		btrfs_lru_cache_clear(&sctx->dir_created_cache);
		btrfs_lru_cache_clear(&sctx->dir_utimes_cache);

		if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
			kfree(sctx->cur_inode_path.buf);

		kfree(sctx);
	}

	return ret;
}