1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_btree_staging.h" 15 #include "xfs_bit.h" 16 #include "xfs_log_format.h" 17 #include "xfs_trans.h" 18 #include "xfs_sb.h" 19 #include "xfs_alloc.h" 20 #include "xfs_alloc_btree.h" 21 #include "xfs_rmap.h" 22 #include "xfs_rmap_btree.h" 23 #include "xfs_inode.h" 24 #include "xfs_refcount.h" 25 #include "xfs_extent_busy.h" 26 #include "xfs_health.h" 27 #include "xfs_bmap.h" 28 #include "xfs_ialloc.h" 29 #include "xfs_ag.h" 30 #include "scrub/xfs_scrub.h" 31 #include "scrub/scrub.h" 32 #include "scrub/common.h" 33 #include "scrub/btree.h" 34 #include "scrub/trace.h" 35 #include "scrub/repair.h" 36 #include "scrub/bitmap.h" 37 #include "scrub/agb_bitmap.h" 38 #include "scrub/xfile.h" 39 #include "scrub/xfarray.h" 40 #include "scrub/newbt.h" 41 #include "scrub/reap.h" 42 43 /* 44 * Free Space Btree Repair 45 * ======================= 46 * 47 * The reverse mappings are supposed to record all space usage for the entire 48 * AG. Therefore, we can recreate the free extent records in an AG by looking 49 * for gaps in the physical extents recorded in the rmapbt. These records are 50 * staged in @free_records. Identifying the gaps is more difficult on a 51 * reflink filesystem because rmap records are allowed to overlap. 52 * 53 * Because the final step of building a new index is to free the space used by 54 * the old index, repair needs to find that space. Unfortunately, all 55 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share 56 * the same rmapbt owner code (OWN_AG), so this is not straightforward. 57 * 58 * The scan of the reverse mapping information records the space used by OWN_AG 59 * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While 60 * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to 61 * record all visited rmap btree blocks and all blocks owned by the AGFL. 62 * 63 * After that is where the definitions of old_allocbt_blocks shifts. This 64 * expression identifies possible former bnobt/cntbt blocks: 65 * 66 * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); 67 * 68 * Substituting from above definitions, that becomes: 69 * 70 * old_allocbt_blocks & ~not_allocbt_blocks 71 * 72 * The OWN_AG bitmap itself isn't needed after this point, so what we really do 73 * instead is: 74 * 75 * old_allocbt_blocks &= ~not_allocbt_blocks; 76 * 77 * After this point, @old_allocbt_blocks is a bitmap of alleged former 78 * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first 79 * parameter in place to avoid copying records around. 80 * 81 * Next, some of the space described by @free_records are diverted to the newbt 82 * reservation and used to format new btree blocks. The remaining records are 83 * written to the new btree indices. We reconstruct both bnobt and cntbt at 84 * the same time since we've already done all the work. 85 * 86 * We use the prefix 'xrep_abt' here because we regenerate both free space 87 * allocation btrees at the same time. 88 */ 89 90 struct xrep_abt { 91 /* Blocks owned by the rmapbt or the agfl. */ 92 struct xagb_bitmap not_allocbt_blocks; 93 94 /* All OWN_AG blocks. */ 95 struct xagb_bitmap old_allocbt_blocks; 96 97 /* 98 * New bnobt information. All btree block reservations are added to 99 * the reservation list in new_bnobt. 100 */ 101 struct xrep_newbt new_bnobt; 102 103 /* new cntbt information */ 104 struct xrep_newbt new_cntbt; 105 106 /* Free space extents. */ 107 struct xfarray *free_records; 108 109 struct xfs_scrub *sc; 110 111 /* Number of non-null records in @free_records. */ 112 uint64_t nr_real_records; 113 114 /* get_records()'s position in the free space record array. */ 115 xfarray_idx_t array_cur; 116 117 /* 118 * Next block we anticipate seeing in the rmap records. If the next 119 * rmap record is greater than next_agbno, we have found unused space. 120 */ 121 xfs_agblock_t next_agbno; 122 123 /* Number of free blocks in this AG. */ 124 xfs_agblock_t nr_blocks; 125 126 /* Longest free extent we found in the AG. */ 127 xfs_agblock_t longest; 128 }; 129 130 /* Set up to repair AG free space btrees. */ 131 int 132 xrep_setup_ag_allocbt( 133 struct xfs_scrub *sc) 134 { 135 struct xfs_group *xg = pag_group(sc->sa.pag); 136 unsigned int busy_gen; 137 138 /* 139 * Make sure the busy extent list is clear because we can't put extents 140 * on there twice. 141 */ 142 if (xfs_extent_busy_list_empty(xg, &busy_gen)) 143 return 0; 144 return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0); 145 } 146 147 /* Check for any obvious conflicts in the free extent. */ 148 STATIC int 149 xrep_abt_check_free_ext( 150 struct xfs_scrub *sc, 151 const struct xfs_alloc_rec_incore *rec) 152 { 153 enum xbtree_recpacking outcome; 154 int error; 155 156 if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) 157 return -EFSCORRUPTED; 158 159 /* Must not be an inode chunk. */ 160 error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, 161 rec->ar_startblock, rec->ar_blockcount, &outcome); 162 if (error) 163 return error; 164 if (outcome != XBTREE_RECPACKING_EMPTY) 165 return -EFSCORRUPTED; 166 167 /* Must not be shared or CoW staging. */ 168 if (sc->sa.refc_cur) { 169 error = xfs_refcount_has_records(sc->sa.refc_cur, 170 XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, 171 rec->ar_blockcount, &outcome); 172 if (error) 173 return error; 174 if (outcome != XBTREE_RECPACKING_EMPTY) 175 return -EFSCORRUPTED; 176 177 error = xfs_refcount_has_records(sc->sa.refc_cur, 178 XFS_REFC_DOMAIN_COW, rec->ar_startblock, 179 rec->ar_blockcount, &outcome); 180 if (error) 181 return error; 182 if (outcome != XBTREE_RECPACKING_EMPTY) 183 return -EFSCORRUPTED; 184 } 185 186 return 0; 187 } 188 189 /* 190 * Stash a free space record for all the space since the last bno we found 191 * all the way up to @end. 192 */ 193 static int 194 xrep_abt_stash( 195 struct xrep_abt *ra, 196 xfs_agblock_t end) 197 { 198 struct xfs_alloc_rec_incore arec = { 199 .ar_startblock = ra->next_agbno, 200 .ar_blockcount = end - ra->next_agbno, 201 }; 202 struct xfs_scrub *sc = ra->sc; 203 int error = 0; 204 205 if (xchk_should_terminate(sc, &error)) 206 return error; 207 208 error = xrep_abt_check_free_ext(ra->sc, &arec); 209 if (error) 210 return error; 211 212 trace_xrep_abt_found(sc->sa.pag, &arec); 213 214 error = xfarray_append(ra->free_records, &arec); 215 if (error) 216 return error; 217 218 ra->nr_blocks += arec.ar_blockcount; 219 return 0; 220 } 221 222 /* Record extents that aren't in use from gaps in the rmap records. */ 223 STATIC int 224 xrep_abt_walk_rmap( 225 struct xfs_btree_cur *cur, 226 const struct xfs_rmap_irec *rec, 227 void *priv) 228 { 229 struct xrep_abt *ra = priv; 230 int error; 231 232 /* Record all the OWN_AG blocks... */ 233 if (rec->rm_owner == XFS_RMAP_OWN_AG) { 234 error = xagb_bitmap_set(&ra->old_allocbt_blocks, 235 rec->rm_startblock, rec->rm_blockcount); 236 if (error) 237 return error; 238 } 239 240 /* ...and all the rmapbt blocks... */ 241 error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); 242 if (error) 243 return error; 244 245 /* ...and all the free space. */ 246 if (rec->rm_startblock > ra->next_agbno) { 247 error = xrep_abt_stash(ra, rec->rm_startblock); 248 if (error) 249 return error; 250 } 251 252 /* 253 * rmap records can overlap on reflink filesystems, so project 254 * next_agbno as far out into the AG space as we currently know about. 255 */ 256 ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, 257 rec->rm_startblock + rec->rm_blockcount); 258 return 0; 259 } 260 261 /* Collect an AGFL block for the not-to-release list. */ 262 static int 263 xrep_abt_walk_agfl( 264 struct xfs_mount *mp, 265 xfs_agblock_t agbno, 266 void *priv) 267 { 268 struct xrep_abt *ra = priv; 269 270 return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); 271 } 272 273 /* 274 * Compare two free space extents by block number. We want to sort in order of 275 * increasing block number. 276 */ 277 static int 278 xrep_bnobt_extent_cmp( 279 const void *a, 280 const void *b) 281 { 282 const struct xfs_alloc_rec_incore *ap = a; 283 const struct xfs_alloc_rec_incore *bp = b; 284 285 if (ap->ar_startblock > bp->ar_startblock) 286 return 1; 287 else if (ap->ar_startblock < bp->ar_startblock) 288 return -1; 289 return 0; 290 } 291 292 /* 293 * Re-sort the free extents by block number so that we can put the records into 294 * the bnobt in the correct order. Make sure the records do not overlap in 295 * physical space. 296 */ 297 STATIC int 298 xrep_bnobt_sort_records( 299 struct xrep_abt *ra) 300 { 301 struct xfs_alloc_rec_incore arec; 302 xfarray_idx_t cur = XFARRAY_CURSOR_INIT; 303 xfs_agblock_t next_agbno = 0; 304 int error; 305 306 error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0); 307 if (error) 308 return error; 309 310 while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) { 311 if (arec.ar_startblock < next_agbno) 312 return -EFSCORRUPTED; 313 314 next_agbno = arec.ar_startblock + arec.ar_blockcount; 315 } 316 317 return error; 318 } 319 320 /* 321 * Compare two free space extents by length and then block number. We want 322 * to sort first in order of increasing length and then in order of increasing 323 * block number. 324 */ 325 static int 326 xrep_cntbt_extent_cmp( 327 const void *a, 328 const void *b) 329 { 330 const struct xfs_alloc_rec_incore *ap = a; 331 const struct xfs_alloc_rec_incore *bp = b; 332 333 if (ap->ar_blockcount > bp->ar_blockcount) 334 return 1; 335 else if (ap->ar_blockcount < bp->ar_blockcount) 336 return -1; 337 return xrep_bnobt_extent_cmp(a, b); 338 } 339 340 /* 341 * Sort the free extents by length so so that we can put the records into the 342 * cntbt in the correct order. Don't let userspace kill us if we're resorting 343 * after allocating btree blocks. 344 */ 345 STATIC int 346 xrep_cntbt_sort_records( 347 struct xrep_abt *ra, 348 bool is_resort) 349 { 350 return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp, 351 is_resort ? 0 : XFARRAY_SORT_KILLABLE); 352 } 353 354 /* 355 * Iterate all reverse mappings to find (1) the gaps between rmap records (all 356 * unowned space), (2) the OWN_AG extents (which encompass the free space 357 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL 358 * blocks. The free space is (1) + (2) - (3) - (4). 359 */ 360 STATIC int 361 xrep_abt_find_freespace( 362 struct xrep_abt *ra) 363 { 364 struct xfs_scrub *sc = ra->sc; 365 struct xfs_mount *mp = sc->mp; 366 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 367 struct xfs_buf *agfl_bp; 368 xfs_agblock_t agend; 369 int error; 370 371 xagb_bitmap_init(&ra->not_allocbt_blocks); 372 373 xrep_ag_btcur_init(sc, &sc->sa); 374 375 /* 376 * Iterate all the reverse mappings to find gaps in the physical 377 * mappings, all the OWN_AG blocks, and all the rmapbt extents. 378 */ 379 error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra); 380 if (error) 381 goto err; 382 383 /* Insert a record for space between the last rmap and EOAG. */ 384 agend = be32_to_cpu(agf->agf_length); 385 if (ra->next_agbno < agend) { 386 error = xrep_abt_stash(ra, agend); 387 if (error) 388 goto err; 389 } 390 391 /* Collect all the AGFL blocks. */ 392 error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); 393 if (error) 394 goto err; 395 396 error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra); 397 if (error) 398 goto err_agfl; 399 400 /* Compute the old bnobt/cntbt blocks. */ 401 error = xagb_bitmap_disunion(&ra->old_allocbt_blocks, 402 &ra->not_allocbt_blocks); 403 if (error) 404 goto err_agfl; 405 406 ra->nr_real_records = xfarray_length(ra->free_records); 407 err_agfl: 408 xfs_trans_brelse(sc->tp, agfl_bp); 409 err: 410 xchk_ag_btcur_free(&sc->sa); 411 xagb_bitmap_destroy(&ra->not_allocbt_blocks); 412 return error; 413 } 414 415 /* 416 * We're going to use the observed free space records to reserve blocks for the 417 * new free space btrees, so we play an iterative game where we try to converge 418 * on the number of blocks we need: 419 * 420 * 1. Estimate how many blocks we'll need to store the records. 421 * 2. If the first free record has more blocks than we need, we're done. 422 * We will have to re-sort the records prior to building the cntbt. 423 * 3. If that record has exactly the number of blocks we need, null out the 424 * record. We're done. 425 * 4. Otherwise, we still need more blocks. Null out the record, subtract its 426 * length from the number of blocks we need, and go back to step 1. 427 * 428 * Fortunately, we don't have to do any transaction work to play this game, so 429 * we don't have to tear down the staging cursors. 430 */ 431 STATIC int 432 xrep_abt_reserve_space( 433 struct xrep_abt *ra, 434 struct xfs_btree_cur *bno_cur, 435 struct xfs_btree_cur *cnt_cur, 436 bool *needs_resort) 437 { 438 struct xfs_scrub *sc = ra->sc; 439 xfarray_idx_t record_nr; 440 unsigned int allocated = 0; 441 int error = 0; 442 443 record_nr = xfarray_length(ra->free_records) - 1; 444 do { 445 struct xfs_alloc_rec_incore arec; 446 uint64_t required; 447 unsigned int desired; 448 unsigned int len; 449 450 /* Compute how many blocks we'll need. */ 451 error = xfs_btree_bload_compute_geometry(cnt_cur, 452 &ra->new_cntbt.bload, ra->nr_real_records); 453 if (error) 454 break; 455 456 error = xfs_btree_bload_compute_geometry(bno_cur, 457 &ra->new_bnobt.bload, ra->nr_real_records); 458 if (error) 459 break; 460 461 /* How many btree blocks do we need to store all records? */ 462 required = ra->new_bnobt.bload.nr_blocks + 463 ra->new_cntbt.bload.nr_blocks; 464 ASSERT(required < INT_MAX); 465 466 /* If we've reserved enough blocks, we're done. */ 467 if (allocated >= required) 468 break; 469 470 desired = required - allocated; 471 472 /* We need space but there's none left; bye! */ 473 if (ra->nr_real_records == 0) { 474 error = -ENOSPC; 475 break; 476 } 477 478 /* Grab the first record from the list. */ 479 error = xfarray_load(ra->free_records, record_nr, &arec); 480 if (error) 481 break; 482 483 ASSERT(arec.ar_blockcount <= UINT_MAX); 484 len = min_t(unsigned int, arec.ar_blockcount, desired); 485 486 trace_xrep_newbt_alloc_ag_blocks(sc->sa.pag, arec.ar_startblock, 487 len, XFS_RMAP_OWN_AG); 488 489 error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, 490 arec.ar_startblock, len); 491 if (error) 492 break; 493 allocated += len; 494 ra->nr_blocks -= len; 495 496 if (arec.ar_blockcount > desired) { 497 /* 498 * Record has more space than we need. The number of 499 * free records doesn't change, so shrink the free 500 * record, inform the caller that the records are no 501 * longer sorted by length, and exit. 502 */ 503 arec.ar_startblock += desired; 504 arec.ar_blockcount -= desired; 505 error = xfarray_store(ra->free_records, record_nr, 506 &arec); 507 if (error) 508 break; 509 510 *needs_resort = true; 511 return 0; 512 } 513 514 /* 515 * We're going to use up the entire record, so unset it and 516 * move on to the next one. This changes the number of free 517 * records (but doesn't break the sorting order), so we must 518 * go around the loop once more to re-run _bload_init. 519 */ 520 error = xfarray_unset(ra->free_records, record_nr); 521 if (error) 522 break; 523 ra->nr_real_records--; 524 record_nr--; 525 } while (1); 526 527 return error; 528 } 529 530 STATIC int 531 xrep_abt_dispose_one( 532 struct xrep_abt *ra, 533 struct xrep_newbt_resv *resv) 534 { 535 struct xfs_scrub *sc = ra->sc; 536 struct xfs_perag *pag = sc->sa.pag; 537 xfs_agblock_t free_agbno = resv->agbno + resv->used; 538 xfs_extlen_t free_aglen = resv->len - resv->used; 539 int error; 540 541 ASSERT(pag == resv->pag); 542 543 /* Add a deferred rmap for each extent we used. */ 544 if (resv->used > 0) 545 xfs_rmap_alloc_extent(sc->tp, false, 546 xfs_agbno_to_fsb(pag, resv->agbno), resv->used, 547 XFS_RMAP_OWN_AG); 548 549 /* 550 * For each reserved btree block we didn't use, add it to the free 551 * space btree. We didn't touch fdblocks when we reserved them, so 552 * we don't touch it now. 553 */ 554 if (free_aglen == 0) 555 return 0; 556 557 trace_xrep_newbt_free_blocks(resv->pag, free_agbno, free_aglen, 558 ra->new_bnobt.oinfo.oi_owner); 559 560 error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, 561 &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); 562 if (error) 563 return error; 564 565 return xrep_defer_finish(sc); 566 } 567 568 /* 569 * Deal with all the space we reserved. Blocks that were allocated for the 570 * free space btrees need to have a (deferred) rmap added for the OWN_AG 571 * allocation, and blocks that didn't get used can be freed via the usual 572 * (deferred) means. 573 */ 574 STATIC void 575 xrep_abt_dispose_reservations( 576 struct xrep_abt *ra, 577 int error) 578 { 579 struct xrep_newbt_resv *resv, *n; 580 581 if (error) 582 goto junkit; 583 584 list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { 585 error = xrep_abt_dispose_one(ra, resv); 586 if (error) 587 goto junkit; 588 } 589 590 junkit: 591 list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { 592 xfs_perag_put(resv->pag); 593 list_del(&resv->list); 594 kfree(resv); 595 } 596 597 xrep_newbt_cancel(&ra->new_bnobt); 598 xrep_newbt_cancel(&ra->new_cntbt); 599 } 600 601 /* Retrieve free space data for bulk load. */ 602 STATIC int 603 xrep_abt_get_records( 604 struct xfs_btree_cur *cur, 605 unsigned int idx, 606 struct xfs_btree_block *block, 607 unsigned int nr_wanted, 608 void *priv) 609 { 610 struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; 611 struct xrep_abt *ra = priv; 612 union xfs_btree_rec *block_rec; 613 unsigned int loaded; 614 int error; 615 616 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { 617 error = xfarray_load_next(ra->free_records, &ra->array_cur, 618 arec); 619 if (error) 620 return error; 621 622 ra->longest = max(ra->longest, arec->ar_blockcount); 623 624 block_rec = xfs_btree_rec_addr(cur, idx, block); 625 cur->bc_ops->init_rec_from_cur(cur, block_rec); 626 } 627 628 return loaded; 629 } 630 631 /* Feed one of the new btree blocks to the bulk loader. */ 632 STATIC int 633 xrep_abt_claim_block( 634 struct xfs_btree_cur *cur, 635 union xfs_btree_ptr *ptr, 636 void *priv) 637 { 638 struct xrep_abt *ra = priv; 639 640 return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); 641 } 642 643 /* 644 * Reset the AGF counters to reflect the free space btrees that we just 645 * rebuilt, then reinitialize the per-AG data. 646 */ 647 STATIC int 648 xrep_abt_reset_counters( 649 struct xrep_abt *ra) 650 { 651 struct xfs_scrub *sc = ra->sc; 652 struct xfs_perag *pag = sc->sa.pag; 653 struct xfs_agf *agf = sc->sa.agf_bp->b_addr; 654 unsigned int freesp_btreeblks = 0; 655 656 /* 657 * Compute the contribution to agf_btreeblks for the new free space 658 * btrees. This is the computed btree size minus anything we didn't 659 * use. 660 */ 661 freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; 662 freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; 663 664 freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); 665 freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); 666 667 /* 668 * The AGF header contains extra information related to the free space 669 * btrees, so we must update those fields here. 670 */ 671 agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + 672 (be32_to_cpu(agf->agf_rmap_blocks) - 1)); 673 agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); 674 agf->agf_longest = cpu_to_be32(ra->longest); 675 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | 676 XFS_AGF_LONGEST | 677 XFS_AGF_FREEBLKS); 678 679 /* 680 * After we commit the new btree to disk, it is possible that the 681 * process to reap the old btree blocks will race with the AIL trying 682 * to checkpoint the old btree blocks into the filesystem. If the new 683 * tree is shorter than the old one, the allocbt write verifier will 684 * fail and the AIL will shut down the filesystem. 685 * 686 * To avoid this, save the old incore btree height values as the alt 687 * height values before re-initializing the perag info from the updated 688 * AGF to capture all the new values. 689 */ 690 pag->pagf_repair_bno_level = pag->pagf_bno_level; 691 pag->pagf_repair_cnt_level = pag->pagf_cnt_level; 692 693 /* Reinitialize with the values we just logged. */ 694 return xrep_reinit_pagf(sc); 695 } 696 697 /* 698 * Use the collected free space information to stage new free space btrees. 699 * If this is successful we'll return with the new btree root 700 * information logged to the repair transaction but not yet committed. 701 */ 702 STATIC int 703 xrep_abt_build_new_trees( 704 struct xrep_abt *ra) 705 { 706 struct xfs_scrub *sc = ra->sc; 707 struct xfs_btree_cur *bno_cur; 708 struct xfs_btree_cur *cnt_cur; 709 struct xfs_perag *pag = sc->sa.pag; 710 bool needs_resort = false; 711 int error; 712 713 /* 714 * Sort the free extents by length so that we can set up the free space 715 * btrees in as few extents as possible. This reduces the amount of 716 * deferred rmap / free work we have to do at the end. 717 */ 718 error = xrep_cntbt_sort_records(ra, false); 719 if (error) 720 return error; 721 722 /* 723 * Prepare to construct the new btree by reserving disk space for the 724 * new btree and setting up all the accounting information we'll need 725 * to root the new btree while it's under construction and before we 726 * attach it to the AG header. 727 */ 728 xrep_newbt_init_bare(&ra->new_bnobt, sc); 729 xrep_newbt_init_bare(&ra->new_cntbt, sc); 730 731 ra->new_bnobt.bload.get_records = xrep_abt_get_records; 732 ra->new_cntbt.bload.get_records = xrep_abt_get_records; 733 734 ra->new_bnobt.bload.claim_block = xrep_abt_claim_block; 735 ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; 736 737 /* Allocate cursors for the staged btrees. */ 738 bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag); 739 xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); 740 741 cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag); 742 xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); 743 744 /* Last chance to abort before we start committing fixes. */ 745 if (xchk_should_terminate(sc, &error)) 746 goto err_cur; 747 748 /* Reserve the space we'll need for the new btrees. */ 749 error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort); 750 if (error) 751 goto err_cur; 752 753 /* 754 * If we need to re-sort the free extents by length, do so so that we 755 * can put the records into the cntbt in the correct order. 756 */ 757 if (needs_resort) { 758 error = xrep_cntbt_sort_records(ra, needs_resort); 759 if (error) 760 goto err_cur; 761 } 762 763 /* 764 * Due to btree slack factors, it's possible for a new btree to be one 765 * level taller than the old btree. Update the alternate incore btree 766 * height so that we don't trip the verifiers when writing the new 767 * btree blocks to disk. 768 */ 769 pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height; 770 pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height; 771 772 /* Load the free space by length tree. */ 773 ra->array_cur = XFARRAY_CURSOR_INIT; 774 ra->longest = 0; 775 error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra); 776 if (error) 777 goto err_levels; 778 779 error = xrep_bnobt_sort_records(ra); 780 if (error) 781 goto err_levels; 782 783 /* Load the free space by block number tree. */ 784 ra->array_cur = XFARRAY_CURSOR_INIT; 785 error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); 786 if (error) 787 goto err_levels; 788 789 /* 790 * Install the new btrees in the AG header. After this point the old 791 * btrees are no longer accessible and the new trees are live. 792 */ 793 xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); 794 xfs_btree_del_cursor(bno_cur, 0); 795 xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); 796 xfs_btree_del_cursor(cnt_cur, 0); 797 798 /* Reset the AGF counters now that we've changed the btree shape. */ 799 error = xrep_abt_reset_counters(ra); 800 if (error) 801 goto err_newbt; 802 803 /* Dispose of any unused blocks and the accounting information. */ 804 xrep_abt_dispose_reservations(ra, error); 805 806 return xrep_roll_ag_trans(sc); 807 808 err_levels: 809 pag->pagf_repair_bno_level = 0; 810 pag->pagf_repair_cnt_level = 0; 811 err_cur: 812 xfs_btree_del_cursor(cnt_cur, error); 813 xfs_btree_del_cursor(bno_cur, error); 814 err_newbt: 815 xrep_abt_dispose_reservations(ra, error); 816 return error; 817 } 818 819 /* 820 * Now that we've logged the roots of the new btrees, invalidate all of the 821 * old blocks and free them. 822 */ 823 STATIC int 824 xrep_abt_remove_old_trees( 825 struct xrep_abt *ra) 826 { 827 struct xfs_perag *pag = ra->sc->sa.pag; 828 int error; 829 830 /* Free the old btree blocks if they're not in use. */ 831 error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, 832 &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); 833 if (error) 834 return error; 835 836 /* 837 * Now that we've zapped all the old allocbt blocks we can turn off 838 * the alternate height mechanism. 839 */ 840 pag->pagf_repair_bno_level = 0; 841 pag->pagf_repair_cnt_level = 0; 842 return 0; 843 } 844 845 /* Repair the freespace btrees for some AG. */ 846 int 847 xrep_allocbt( 848 struct xfs_scrub *sc) 849 { 850 struct xrep_abt *ra; 851 struct xfs_mount *mp = sc->mp; 852 unsigned int busy_gen; 853 char *descr; 854 int error; 855 856 /* We require the rmapbt to rebuild anything. */ 857 if (!xfs_has_rmapbt(mp)) 858 return -EOPNOTSUPP; 859 860 ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); 861 if (!ra) 862 return -ENOMEM; 863 ra->sc = sc; 864 865 /* We rebuild both data structures. */ 866 sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; 867 868 /* 869 * Make sure the busy extent list is clear because we can't put extents 870 * on there twice. In theory we cleared this before we started, but 871 * let's not risk the filesystem. 872 */ 873 if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) { 874 error = -EDEADLOCK; 875 goto out_ra; 876 } 877 878 /* Set up enough storage to handle maximally fragmented free space. */ 879 descr = xchk_xfile_ag_descr(sc, "free space records"); 880 error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, 881 sizeof(struct xfs_alloc_rec_incore), 882 &ra->free_records); 883 kfree(descr); 884 if (error) 885 goto out_ra; 886 887 /* Collect the free space data and find the old btree blocks. */ 888 xagb_bitmap_init(&ra->old_allocbt_blocks); 889 error = xrep_abt_find_freespace(ra); 890 if (error) 891 goto out_bitmap; 892 893 /* Rebuild the free space information. */ 894 error = xrep_abt_build_new_trees(ra); 895 if (error) 896 goto out_bitmap; 897 898 /* Kill the old trees. */ 899 error = xrep_abt_remove_old_trees(ra); 900 if (error) 901 goto out_bitmap; 902 903 out_bitmap: 904 xagb_bitmap_destroy(&ra->old_allocbt_blocks); 905 xfarray_destroy(ra->free_records); 906 out_ra: 907 kfree(ra); 908 return error; 909 } 910 911 /* Make sure both btrees are ok after we've rebuilt them. */ 912 int 913 xrep_revalidate_allocbt( 914 struct xfs_scrub *sc) 915 { 916 __u32 old_type = sc->sm->sm_type; 917 int error; 918 919 /* 920 * We must update sm_type temporarily so that the tree-to-tree cross 921 * reference checks will work in the correct direction, and also so 922 * that tracing will report correctly if there are more errors. 923 */ 924 sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; 925 error = xchk_allocbt(sc); 926 if (error) 927 goto out; 928 929 sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; 930 error = xchk_allocbt(sc); 931 out: 932 sc->sm->sm_type = old_type; 933 return error; 934 } 935