// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code.  The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs.  Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is
 * under construction by the bulk loading code.  Staging btree cursors point
 * to fake root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type.  When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor.  Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_afakeroot		*afake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	cur->bc_ag.afake = NULL;
	cur->bc_ag.agbp = agbp;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}
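
/*
 * As a rough sketch of the convention described above, an AG btree type's
 * staging and commit helpers might look like the following.  The xfs_foobt_*
 * names are hypothetical; each btree type supplies its own cursor setup and
 * root-logging helpers:
 *
 *	struct xfs_btree_cur *
 *	xfs_foobt_stage_cursor(
 *		struct xfs_mount	*mp,
 *		struct xbtree_afakeroot	*afake)
 *	{
 *		struct xfs_btree_cur	*cur;
 *
 *		cur = xfs_foobt_init_cursor(...);
 *		xfs_btree_stage_afakeroot(cur, afake);
 *		return cur;
 *	}
 *
 *	void
 *	xfs_foobt_commit_staged_btree(
 *		struct xfs_btree_cur	*cur,
 *		struct xfs_trans	*tp,
 *		struct xfs_buf		*agbp)
 *	{
 *		struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
 *
 *		xfs_foobt_set_root(tp, agbp, afake->af_root, afake->af_levels);
 *		xfs_btree_commit_afakeroot(cur, tp, agbp);
 *	}
 */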

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor.  This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor.  The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ino.forksize = ifake->if_fork_size;
	cur->bc_ino.whichfork = XFS_STAGING_FORK;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops.  The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xfs_trans		*tp,
	int				whichfork)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}
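
/*
 * A minimal sketch of the xbtree_ifakeroot setup described above, with error
 * handling omitted.  The xfs_foobt_stage_cursor helper is hypothetical, and
 * the name of the fork-size helper varies by kernel version:
 *
 *	struct xbtree_ifakeroot	ifake = { };
 *	struct xfs_btree_cur	*cur;
 *
 *	ifake.if_fork = kzalloc(sizeof(struct xfs_ifork), GFP_KERNEL);
 *	ifake.if_fork_size = xfs_inode_fork_size(ip, whichfork);
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 *	cur = xfs_foobt_stage_cursor(mp, ip, &ifake);
 *
 * When loading is complete, the per-btree _commit_staged_btree helper logs
 * the new root copied out of ifake.if_fork and calls
 * xfs_btree_commit_ifakeroot() to switch the cursor over to the real fork.
 */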

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block).  When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure
 * and then construct a staged btree cursor.  Refer to the block comments
 * about "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted
 * Btrees" for more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree.  See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 * save them for later use by ->claim_block().  Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 * rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor.  The caller
 * is responsible for cleaning up the previous btree blocks, if any.  A sketch
 * of the whole sequence follows this comment.
 *
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the
 * following algorithm: For leaf levels, the number of items for the level is
 * nr_records in the bload structure.  For node levels, the number of items
 * for the level is the number of blocks in the next lower level of the tree.
 * For each level, the desired number of items per block is defined as:
 *
 *	desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 *	blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never fall
 * below minrecs.  The number of items that will actually be loaded into each
 * btree block is defined as:
 *
 *	npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division.  If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are
 * computed and returned via btree_height and nr_blocks.
 */
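
/*
 * A rough sketch of the sequence above.  Error handling is omitted, and the
 * block preallocation and commit details are left to the per-btree-type
 * caller; names in <angle brackets> are placeholders:
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= <fill records callback>,
 *		.claim_block	= <hand out one preallocated block>,
 *		.leaf_slack	= -1,
 *		.node_slack	= -1,
 *	};
 *
 *	cur = <stage a btree cursor as described above>;
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *
 *	<allocate bbl.nr_blocks blocks and stash them for ->claim_block()>
 *
 *	error = xfs_btree_bload(cur, &bbl, priv);
 *
 *	<log the new root and commit the staging cursor>
 */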

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	struct xfs_buf			**bpp)
{
	struct xfs_buf			*bp = *bpp;
	int				error;

	if (!bp)
		return 0;

	/*
	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
	 * xfs_buf_read will not pointlessly reread the contents from the disk.
	 */
	bp->b_flags |= XBF_DONE;

	xfs_buf_delwri_queue_here(bp, buffers_list);
	xfs_buf_relse(bp);
	*bpp = NULL;
	bbl->nr_dirty++;

	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
		return 0;

	error = xfs_buf_delwri_submit(buffers_list);
	if (error)
		return error;

	bbl->nr_dirty = 0;
	return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the
 * values of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
		ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
				level, nr_this_block, cur->bc_ino.ip->i_ino);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
	if (ret)
		return ret;

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_records_fn	get_records,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j = 1;
	int				ret;

	/* Fill the leaf block with records. */
	while (j <= recs_this_block) {
		ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
		if (ret < 0)
			return ret;
		j += ret;
	}

	return 0;
}
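
/*
 * A minimal sketch of a ->get_records callback, assuming the caller's
 * private data (the hypothetical struct xfoo_load and xfoo_next_record
 * helper below) can produce records in sorted order.  Real implementations
 * copy each record into bc_rec and may return fewer than @nr_wanted records:
 *
 *	static int
 *	xfoo_get_records(
 *		struct xfs_btree_cur	*cur,
 *		unsigned int		idx,
 *		struct xfs_btree_block	*block,
 *		unsigned int		nr_wanted,
 *		void			*priv)
 *	{
 *		struct xfoo_load	*xl = priv;
 *		union xfs_btree_rec	*block_rec;
 *		unsigned int		loaded;
 *
 *		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
 *			xfoo_next_record(xl, &cur->bc_rec);
 *			block_rec = xfs_btree_rec_addr(cur, idx, block);
 *			cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		}
 *		return loaded;
 *	}
 */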

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree.  A
 * key/ptr entry will be created in the new node block to the block pointed to
 * by child_ptr.  On exit, child_ptr points to the next block on the child
 * level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		/*
		 * Read the lower-level block in case the buffer for it has
		 * been reclaimed.  LRU refs will be set on the block, which is
		 * desirable if the new btree commits.
		 */
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
				&child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want
 * to install at this level in the btree.  Caller is responsible for having
 * set @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want
 * to install at this level in the btree, which must be somewhere between
 * minrecs and max_npb.  The caller is free to install fewer records per
 * block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level and
 * the number of blocks for this level.  For leaf levels, we must populate an
 * empty root block even if there are no records, so we have to have at least
 * one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	/*
	 * Compute the absolute maximum number of records that we can store in
	 * the ondisk block or inode root.
	 */
	if (cur->bc_ops->get_dmaxrecs)
		maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
	else
		maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block.  Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks.  Take care that the effective # of items per block (npb)
	 * won't exceed maxrecs even for the blocks that get an extra record,
	 * since desired_npb could be maxrecs, and in the previous step we
	 * rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}
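
/*
 * A worked example of the computation above, using made-up numbers: suppose
 * a leaf level has maxrecs = 100, minrecs = 50, leaf_slack = 10, and
 * nr_this_level = 453 records.  Then:
 *
 *	desired_npb	= max(50, 100 - 10) = 90
 *	blocks		= max(1, 453 / 90) = 5
 *	npb		= 453 / 5 = 90, blocks_with_extra = 3
 *
 * npb does not exceed maxrecs, so the level is loaded as three blocks of 91
 * records followed by two blocks of 90 records (3 * 91 + 2 * 90 = 453).
 */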

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 * that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
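
/*
 * For example (made-up numbers): with maxrecs = 16 and minrecs = 8,
 * xfs_btree_bload_ensure_slack() turns a negative slack value into
 * 16 - ((16 + 8) >> 1) = 4, so new blocks are loaded with about 12 records
 * each, i.e. roughly 75% full.
 */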

/*
 * Prepare a btree cursor for a bulk load operation by computing the geometry
 * fields in bbl.  Caller must ensure that the btree cursor is a staging
 * cursor.  This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	const struct xfs_btree_ops *ops = cur->bc_ops;
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf and
	 * node blocks.  Inode-rooted btrees will return different minrecs and
	 * maxrecs values for the root block (level == bc_nlevels - 1).  We're
	 * checking levels 0 and 1 here, so set bc_nlevels such that the btree
	 * code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (ops->type == XFS_BTREE_TYPE_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have our
			 * btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if ((level != 0 ||
			     (ops->geom_flags & XFS_BTGEO_IROOT_RECORDS)) &&
			    nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore need
			 * another level of btree to point to those blocks.
			 *
			 * We have to re-compute the geometry for each level of
			 * an inode-rooted btree because the geometry differs
			 * between a btree root in an inode fork and a
			 * traditional btree block.
			 *
			 * This distinction is made in the btree code based on
			 * whether level == bc_nlevels - 1.  Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready to
			 * populate the root.  Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (ops->type == XFS_BTREE_TYPE_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}
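
/*
 * A minimal sketch of a ->claim_block callback for an AG-rooted (short
 * pointer) btree, assuming the caller stashed its preallocated blocks in the
 * hypothetical struct xfoo_resv.  Inode-rooted (long pointer) btrees would
 * fill in ptr->l with a 64-bit fsblock number instead:
 *
 *	static int
 *	xfoo_claim_block(
 *		struct xfs_btree_cur	*cur,
 *		union xfs_btree_ptr	*ptr,
 *		void			*priv)
 *	{
 *		struct xfoo_resv	*resv = priv;
 *
 *		if (resv->nr_free == 0)
 *			return -ENOSPC;
 *		ptr->s = cpu_to_be32(resv->next_agbno);
 *		resv->next_agbno++;
 *		resv->nr_free--;
 *		return 0;
 *	}
 */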

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
	bbl->nr_dirty = 0;

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases.  blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess across
		 * the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
				nr_this_block, &ptr, &bp, &block, priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
				block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;

	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
	if (ret)
		goto out;

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;

		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
		if (ret)
			goto out;

		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk.  If the ordered list isn't empty after
	 * that, then something went wrong and we have to fail.  This should
	 * never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}