// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
#include "block-group.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
				u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

/*
 * look for the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
						  u64 file_offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (inode->ordered_tree_last) {
		entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
				 rb_node);
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
			return inode->ordered_tree_last;
	}
	ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		inode->ordered_tree_last = ret;
	return ret;
}

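/*
 * Descriptive note (added for clarity, derived from the code below): allocate
 * and initialize an ordered extent for the given file range.  The qgroup data
 * reservation for the range is either freed right away (NOCOW/PREALLOC) or
 * transferred into the ordered extent (COW), as explained in the comment in
 * the function body.  A reference on the inode is taken via igrab() and is
 * dropped when the ordered extent is freed.  The caller is still responsible
 * for inserting the entry into the tree with insert_ordered_extent().
 */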
static struct btrfs_ordered_extent *alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
			u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
			u64 offset, unsigned long flags, int compress_type)
{
	struct btrfs_ordered_extent *entry;
	int ret;
	u64 qgroup_rsv = 0;
	const bool is_nocow = (flags &
			       ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));

	/*
	 * For a NOCOW write we can free the qgroup reserve right now. For a COW
	 * one we transfer the reserved space from the inode's iotree into the
	 * ordered extent by calling btrfs_qgroup_release_data() and tracking
	 * the qgroup reserved amount in the ordered extent, so that later after
	 * completing the ordered extent, when running the data delayed ref it
	 * creates, we free the reserved data with btrfs_qgroup_free_refroot().
	 */
	if (is_nocow)
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
	else
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);

	if (ret < 0)
		return ERR_PTR(ret);

	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry) {
		entry = ERR_PTR(-ENOMEM);
		goto out;
	}

	entry->file_offset = file_offset;
	entry->num_bytes = num_bytes;
	entry->ram_bytes = ram_bytes;
	entry->disk_bytenr = disk_bytenr;
	entry->disk_num_bytes = disk_num_bytes;
	entry->offset = offset;
	entry->bytes_left = num_bytes;
	if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
		entry = ERR_PTR(-ESTALE);
		goto out;
	}
	entry->inode = inode;
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	entry->qgroup_rsv = qgroup_rsv;
	entry->flags = flags;
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	INIT_LIST_HEAD(&entry->bioc_list);
	init_completion(&entry->completion);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);

out:
	if (IS_ERR(entry) && !is_nocow)
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  btrfs_root_id(inode->root),
					  qgroup_rsv, BTRFS_QGROUP_RSV_DATA);

	return entry;
}

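/*
 * Descriptive note (added for clarity, derived from the code below): insert an
 * already allocated ordered extent into the inode's ordered tree and into the
 * root's list of ordered extents, taking the tree's reference.  Inserting an
 * overlapping range indicates an inconsistency and triggers btrfs_panic().
 */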
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct btrfs_inode *inode = entry->inode;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;

	trace_btrfs_ordered_extent_add(inode, entry);

	percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
				 fs_info->delalloc_batch);

	/* One ref for the tree. */
	refcount_inc(&entry->refs);

	spin_lock_irq(&inode->ordered_tree_lock);
	node = tree_insert(&inode->ordered_tree, entry->file_offset,
			   &entry->rb_node);
	if (unlikely(node))
		btrfs_panic(fs_info, -EEXIST,
			    "inconsistency in ordered tree at offset %llu",
			    entry->file_offset);
	spin_unlock_irq(&inode->ordered_tree_lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
}

/*
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:           Inode that this extent is for.
 * @file_offset:     Logical offset in file where the extent starts.
 * @num_bytes:       Logical length of extent in file.
 * @ram_bytes:       Full length of unencoded data.
 * @disk_bytenr:     Offset of extent on disk.
 * @disk_num_bytes:  Size of extent on disk.
 * @offset:          Offset into unencoded data where file data starts.
 * @flags:           Flags specifying type of extent (1U << BTRFS_ORDERED_*).
 * @compress_type:   Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted, and
 * the returned pointer is given a second reference.
 *
 * Return: the new ordered extent or error pointer.
 */
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
			struct btrfs_inode *inode, u64 file_offset,
			const struct btrfs_file_extent *file_extent, unsigned long flags)
{
	struct btrfs_ordered_extent *entry;

	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);

	/*
	 * For regular writes, we just use the members in @file_extent.
	 *
	 * For NOCOW, we don't really care about the numbers except @start and
	 * file_extent->num_bytes, as we won't insert a file extent item at all.
	 *
	 * For PREALLOC, we do not use ordered extent members, but
	 * btrfs_mark_extent_written() handles everything.
	 *
	 * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
	 * or btrfs_split_ordered_extent() cannot handle it correctly.
	 */
	if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
		entry = alloc_ordered_extent(inode, file_offset,
					     file_extent->num_bytes,
					     file_extent->num_bytes,
					     file_extent->disk_bytenr + file_extent->offset,
					     file_extent->num_bytes, 0, flags,
					     file_extent->compression);
	else
		entry = alloc_ordered_extent(inode, file_offset,
					     file_extent->num_bytes,
					     file_extent->ram_bytes,
					     file_extent->disk_bytenr,
					     file_extent->disk_num_bytes,
					     file_extent->offset, flags,
					     file_extent->compression);
	if (!IS_ERR(entry))
		insert_ordered_extent(entry);
	return entry;
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_inode *inode = entry->inode;

	spin_lock_irq(&inode->ordered_tree_lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&inode->ordered_tree_lock);
}

void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
{
	if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
		mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
}

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;

	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}

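/*
 * Descriptive note (added for clarity, derived from the code below): decrement
 * the pending IO amount (@len bytes) of an ordered extent and, for buffered
 * IO, clear the folio's ordered flag for that sub-range.  Must be called with
 * the inode's ordered_tree_lock held.  Returns true once the whole ordered
 * extent has no bytes left, in which case an extra reference has been taken so
 * the caller can queue its completion.
 */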
static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				      struct folio *folio, u64 file_offset,
				      u64 len, bool uptodate)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	lockdep_assert_held(&inode->ordered_tree_lock);

	if (folio) {
		ASSERT(folio->mapping);
		ASSERT(folio_pos(folio) <= file_offset);
		ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));

		/*
		 * Ordered flag indicates whether we still have
		 * pending io unfinished for the ordered extent.
		 *
		 * If it's not set, we need to skip to next range.
		 */
		if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
			return false;
		btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
	}

	/* Now we're fine to update the accounting. */
	if (WARN_ON_ONCE(len > ordered->bytes_left)) {
		btrfs_crit(fs_info,
			   "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
			   btrfs_root_id(inode->root), btrfs_ino(inode),
			   ordered->file_offset, ordered->num_bytes,
			   len, ordered->bytes_left);
		ordered->bytes_left = 0;
	} else {
		ordered->bytes_left -= len;
	}

	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);

	if (ordered->bytes_left)
		return false;

	/*
	 * All the IO of the ordered extent is finished, we need to queue
	 * the finish_func to be executed.
	 */
	set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
	cond_wake_up(&ordered->wait);
	refcount_inc(&ordered->refs);
	trace_btrfs_ordered_extent_mark_finished(inode, ordered);
	return true;
}

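/*
 * Descriptive note (added for clarity, derived from the code below): hand a
 * fully written ordered extent over to the appropriate endio work queue,
 * where finish_ordered_fn() will run btrfs_finish_ordered_io().
 */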
static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
		fs_info->endio_freespace_worker : fs_info->endio_write_workers;

	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
	btrfs_queue_work(wq, &ordered->work);
}

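/*
 * Descriptive note (added for clarity, derived from the code below): mark
 * @len bytes starting at @file_offset of an ordered extent as finished and
 * queue the ordered extent's completion once all of its bytes are done.  Also
 * records a COW write error in the inode so that a following fast fsync waits
 * for ordered extent completion (see the comment in the function body).
 */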
void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
				 struct folio *folio, u64 file_offset, u64 len,
				 bool uptodate)
{
	struct btrfs_inode *inode = ordered->inode;
	unsigned long flags;
	bool ret;

	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
					uptodate);
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);

	/*
	 * If this is a COW write it means we created new extent maps for the
	 * range and they point to unwritten locations if we got an error either
	 * before submitting a bio or during IO.
	 *
	 * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
	 * are queuing its completion below. During completion, at
	 * btrfs_finish_one_ordered(), we will drop the extent maps for the
	 * unwritten extents.
	 *
	 * However because completion runs in a work queue we can end up having
	 * a fast fsync running before that. In the case of direct IO, once we
	 * unlock the inode the fsync might start, and we queue the completion
	 * before unlocking the inode. In the case of buffered IO when writeback
	 * finishes (end_bbio_data_write()) we queue the completion, so if the
	 * writeback was triggered by a fast fsync, the fsync might start
	 * logging before ordered extent completion runs in the work queue.
	 *
	 * The fast fsync will log file extent items based on the extent maps it
	 * finds, so if by the time it collects extent maps the ordered extent
	 * completion didn't happen yet, it will log file extent items that
	 * point to unwritten extents, resulting in a corruption if a crash
	 * happens and the log tree is replayed. Note that a fast fsync does not
	 * wait for completion of ordered extents in order to reduce latency.
	 *
	 * Set a flag in the inode so that the next fast fsync will wait for
	 * ordered extents to complete before starting to log.
	 */
	if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
		set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);

	if (ret)
		btrfs_queue_ordered_fn(ordered);
}

/*
 * Mark all ordered extents io inside the specified range finished.
 *
 * @folio: The involved folio for the operation.
 *         For uncompressed buffered IO, the folio status also needs to be
 *         updated to indicate whether the pending ordered io is finished.
 *         Can be NULL for direct IO and compressed write.
 *         For these cases, callers are ensured they won't execute the
 *         endio function twice.
 *
 * This function is called for endio, thus the range must have ordered
 * extent(s) covering it.
 */
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				    struct folio *folio, u64 file_offset,
				    u64 num_bytes, bool uptodate)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	u64 cur = file_offset;

	trace_btrfs_writepage_end_io_hook(inode, file_offset,
					  file_offset + num_bytes - 1,
					  uptodate);

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	while (cur < file_offset + num_bytes) {
		u64 entry_end;
		u64 end;
		u32 len;

		node = ordered_tree_search(inode, cur);
		/* No ordered extents at all */
		if (!node)
			break;

		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
		/*
		 * |<-- OE --->|  |
		 *              cur
		 * Go to next OE.
		 */
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |    |<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *       |
		 *       cur
		 */
		end = min(entry->file_offset + entry->num_bytes,
			  file_offset + num_bytes) - 1;
		ASSERT(end + 1 - cur < U32_MAX);
		len = end + 1 - cur;

		if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
			spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
			btrfs_queue_ordered_fn(entry);
			spin_lock_irqsave(&inode->ordered_tree_lock, flags);
		}
		cur += len;
	}
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}

/*
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:      The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 *               Will be also used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:     Length of the finish IO range
 *
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
				    u64 file_offset, u64 io_size)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	bool finished = false;

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		goto out;

	if (io_size > entry->bytes_left)
		btrfs_crit(inode->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);

	entry->bytes_left -= io_size;

	if (entry->bytes_left == 0) {
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	}
out:
	if (finished && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
	}
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
	return finished;
}

/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (refcount_dec_and_test(&entry->refs)) {
		struct btrfs_ordered_sum *sum;
		struct btrfs_ordered_sum *tmp;

		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(list_empty(&entry->log_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		btrfs_add_delayed_iput(entry->inode);
		list_for_each_entry_safe(sum, tmp, &entry->list, list)
			kvfree(sum);
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

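/*
 * Reference counting note (illustrative summary added for clarity, not a new
 * interface): the lookup helpers further below return the ordered extent with
 * an extra reference held, so callers are expected to pair them with
 * btrfs_put_ordered_extent(), for example:
 *
 *	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
 *	if (ordered) {
 *		btrfs_start_ordered_extent(ordered);
 *		btrfs_put_ordered_extent(ordered);
 *	}
 *
 * The same pattern is used in this file by btrfs_wait_ordered_range() and
 * btrfs_lock_and_flush_ordered_range().
 */
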
/*
 * remove an ordered extent from the tree.  No references are dropped
 * and waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_root *root = btrfs_inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	bool pending;
	bool freespace_inode;

	/*
	 * If this is a free space inode the thread has not acquired the ordered
	 * extents lockdep map.
	 */
	freespace_inode = btrfs_is_free_space_inode(btrfs_inode);

	btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
	/* This is paired with alloc_ordered_extent(). */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root) {
		u64 release;

		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
			release = entry->disk_num_bytes;
		else
			release = entry->num_bytes;
		btrfs_delalloc_release_metadata(btrfs_inode, release,
						test_bit(BTRFS_ORDERED_IOERR,
							 &entry->flags));
	}

	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);

	spin_lock_irq(&btrfs_inode->ordered_tree_lock);
	node = &entry->rb_node;
	rb_erase(node, &btrfs_inode->ordered_tree);
	RB_CLEAR_NODE(node);
	if (btrfs_inode->ordered_tree_last == node)
		btrfs_inode->ordered_tree_last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
	spin_unlock_irq(&btrfs_inode->ordered_tree_lock);

	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans || BTRFS_FS_ERROR(fs_info));
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

	btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
	if (!freespace_inode)
		btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}

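/*
 * Descriptive note (added for clarity, derived from the code below): work
 * queue callback used by btrfs_wait_ordered_extents(): start the ordered
 * extent (flush and wait for its IO) and signal the waiter via ->completion.
 */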
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered);
	complete(&ordered->completion);
}

/*
 * Wait for all the ordered extents in a root. Use @bg as range or do whole
 * range if it's NULL.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	u64 range_start, range_len;
	u64 range_end;

	if (bg) {
		range_start = bg->start;
		range_len = bg->length;
	} else {
		range_start = 0;
		range_len = U64_MAX;
	}
	range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		if (nr != U64_MAX)
			nr--;
		count++;
		spin_lock(&root->ordered_extent_lock);
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

/*
 * Wait for @nr ordered extents that intersect the @bg, or the whole range of
 * the filesystem if @bg is NULL.
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const struct btrfs_block_group *bg)
{
	struct btrfs_root *root;
	LIST_HEAD(splice);
	u64 done;

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr, bg);
		btrfs_put_root(root);

		if (nr != U64_MAX)
			nr -= done;

		spin_lock(&fs_info->ordered_root_lock);
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Start IO and wait for a given ordered extent to finish.
 *
 * Wait on page writeback for all the pages in the extent but not in
 * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
 * IO completion code to insert metadata into the btree corresponding to the extent.
 */
void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
					    u64 nowriteback_start, u32 nowriteback_len)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;
	struct btrfs_inode *inode = entry->inode;
	bool freespace_inode;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * If this is a free space inode do not take the ordered extents lockdep
	 * map.
	 */
	freespace_inode = btrfs_is_free_space_inode(inode);

	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
		if (!nowriteback_len) {
			filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
		} else {
			if (start < nowriteback_start)
				filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
							 nowriteback_start - 1);
			if (nowriteback_start + nowriteback_len < end)
				filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
							 nowriteback_start + nowriteback_len,
							 end);
		}
	}

	if (!freespace_inode)
		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
	wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = OFFSET_MAX;
	} else {
		orig_end = start + len - 1;
		if (orig_end > OFFSET_MAX)
			orig_end = OFFSET_MAX;
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(ordered);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
							 u64 file_offset)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;

	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
		entry = NULL;
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup(inode, entry);
	}
out:
	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node) {
		node = ordered_tree_search(inode, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (btrfs_range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_range(inode, entry);
	}
	spin_unlock_irq(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Adds all ordered extents to the given list. The list ends up sorted by the
 * file_offset of the ordered extents.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct rb_node *n;

	btrfs_assert_inode_locked(inode);

	spin_lock_irq(&inode->ordered_tree_lock);
	for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
		struct btrfs_ordered_extent *ordered;

		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);

		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
			continue;

		ASSERT(list_empty(&ordered->log_list));
		list_add_tail(&ordered->log_list, list);
		refcount_inc(&ordered->refs);
		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
	}
	spin_unlock_irq(&inode->ordered_tree_lock);
}

/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&inode->ordered_tree_lock);
	node = ordered_tree_search(inode, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
	trace_btrfs_ordered_extent_lookup_first(inode, entry);
out:
	spin_unlock_irq(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
			struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct rb_node *node;
	struct rb_node *cur;
	struct rb_node *prev;
	struct rb_node *next;
	struct btrfs_ordered_extent *entry = NULL;

	spin_lock_irq(&inode->ordered_tree_lock);
	node = inode->ordered_tree.rb_node;
	/*
	 * Here we don't want to use tree_search() which will use tree->last
	 * and screw up the search order.
	 * And __tree_search() can't return the adjacent ordered extents
	 * either, thus here we do our own search.
	 */
	while (node) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset) {
			node = node->rb_left;
		} else if (file_offset >= entry_end(entry)) {
			node = node->rb_right;
		} else {
			/*
			 * Direct hit, got an ordered extent that starts at
			 * @file_offset
			 */
			goto out;
		}
	}
	if (!entry) {
		/* Empty tree */
		goto out;
	}

	cur = &entry->rb_node;
	/* We got an entry around @file_offset, check adjacent entries */
	if (entry->file_offset < file_offset) {
		prev = cur;
		next = rb_next(cur);
	} else {
		prev = rb_prev(cur);
		next = cur;
	}
	if (prev) {
		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
		if (btrfs_range_overlaps(entry, file_offset, len))
			goto out;
	}
	if (next) {
		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
		if (btrfs_range_overlaps(entry, file_offset, len))
			goto out;
	}
	/* No ordered extent in the range */
	entry = NULL;
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
	}

	spin_unlock_irq(&inode->ordered_tree_lock);
	return entry;
}

/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * Always return with the given range locked, ensuring after it's called no
 * ordered extent can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
}

/*
 * Lock the passed range and ensure all pending ordered extents in it are run
 * to completion in nowait mode.
 *
 * Return true if btrfs_lookup_ordered_range() does not return any extents,
 * otherwise false.
 */
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
				  struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;

	if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
		return false;

	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
	if (!ordered)
		return true;

	btrfs_put_ordered_extent(ordered);
	btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);

	return false;
}

/* Split out a new ordered extent for the first @len bytes of @ordered. */
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
			struct btrfs_ordered_extent *ordered, u64 len)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 file_offset = ordered->file_offset;
	u64 disk_bytenr = ordered->disk_bytenr;
	unsigned long flags = ordered->flags;
	struct btrfs_ordered_sum *sum, *tmpsum;
	struct btrfs_ordered_extent *new;
	struct rb_node *node;
	u64 offset = 0;

	trace_btrfs_ordered_extent_split(inode, ordered);

	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));

	/*
	 * The entire bio must be covered by the ordered extent, but we can't
	 * reduce the original extent to a zero length either.
	 */
	if (WARN_ON_ONCE(len >= ordered->num_bytes))
		return ERR_PTR(-EINVAL);
	/*
	 * If our ordered extent had an error there's no point in continuing.
	 * The error may have come from a transaction abort done either by this
	 * task or some other concurrent task, and the transaction abort path
	 * iterates over all existing ordered extents and sets the flag
	 * BTRFS_ORDERED_IOERR on them.
	 */
	if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
		const int fs_error = BTRFS_FS_ERROR(fs_info);

		return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
	}
	/* We cannot split partially completed ordered extents. */
	if (ordered->bytes_left) {
		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
		if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
			return ERR_PTR(-EINVAL);
	}
	/* We cannot split a compressed ordered extent. */
	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
		return ERR_PTR(-EINVAL);

	new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
				   len, 0, flags, ordered->compress_type);
	if (IS_ERR(new))
		return new;

	/* One ref for the tree. */
	refcount_inc(&new->refs);

	/*
	 * Take the root's ordered_extent_lock to avoid a race with
	 * btrfs_wait_ordered_extents() when updating the disk_bytenr and
	 * disk_num_bytes fields of the ordered extent below. And we disable
	 * IRQs because the inode's ordered_tree_lock is used in IRQ context
	 * elsewhere.
	 *
	 * There's no concern about a previous caller of
	 * btrfs_wait_ordered_extents() getting the trimmed ordered extent
	 * before we insert the new one, because even if it gets the ordered
	 * extent before it's trimmed and the new one inserted, right before it
	 * uses it or during its use, the ordered extent might have been
	 * trimmed in the meanwhile, and it missed the new ordered extent.
	 * There's no way around this and it's harmless for current use cases,
	 * so we take the root's ordered_extent_lock to fix that race during
	 * trimming and silence tools like KCSAN.
	 */
	spin_lock_irq(&root->ordered_extent_lock);
	spin_lock(&inode->ordered_tree_lock);

	/*
	 * We don't have overlapping ordered extents (that would imply double
	 * allocation of extents) and we checked above that the split length
	 * does not cross the ordered extent's num_bytes field, so there's
	 * no need to remove it and re-insert it in the tree.
	 */
	ordered->file_offset += len;
	ordered->disk_bytenr += len;
	ordered->num_bytes -= len;
	ordered->disk_num_bytes -= len;
	ordered->ram_bytes -= len;

	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
		ASSERT(ordered->bytes_left == 0);
		new->bytes_left = 0;
	} else {
		ordered->bytes_left -= len;
	}

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
		if (ordered->truncated_len > len) {
			ordered->truncated_len -= len;
		} else {
			new->truncated_len = ordered->truncated_len;
			ordered->truncated_len = 0;
		}
	}

	list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
		if (offset == len)
			break;
		list_move_tail(&sum->list, &new->list);
		offset += sum->len;
	}

	node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
	if (unlikely(node))
		btrfs_panic(fs_info, -EEXIST,
			    "inconsistency in ordered tree at offset %llu after split",
			    new->file_offset);
	spin_unlock(&inode->ordered_tree_lock);

	list_add_tail(&new->root_extent_list, &root->ordered_extents);
	root->nr_ordered_extents++;
	spin_unlock_irq(&root->ordered_extent_lock);
	return new;
}

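/*
 * Descriptive note (added for clarity): create and destroy the slab cache
 * used for struct btrfs_ordered_extent, called at btrfs init and exit.
 */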
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}