// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
#include "xfs_health.h"
#include "xfs_extfree_item.h"

struct kmem_cache	*xfs_extfree_item_cache;

struct workqueue_struct *xfs_alloc_wq;

#define XFSA_FIXUP_BNO_OK	1
#define XFSA_FIXUP_CNT_OK	2

/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_has_crc(mp))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}

unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_has_reflink(mp))
		return xfs_refc_block(mp) + 1;
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

/*
 * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the
 * blocks used by the freesp btrees themselves, and the blocks owned by the
 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
 * free space in the AG drop so low that the free space btrees cannot refill an
 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
 * until the fs goes down, we subtract this many AG blocks from the incore
 * fdblocks to ensure user allocation does not overcommit the space the
 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
 * withhold space from xfs_dec_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4

/*
 * Compute the number of blocks that we set aside to guarantee the ability to
 * refill the AGFL and handle a full bmap btree split.
 *
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * the AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks.  However, these restrictions may result in no actual space
 * being allocated for a delayed extent; for example, a data block in a certain
 * AG is allocated but there is no additional block available for the extra
 * bmap btree block needed when the file's bmap btree splits.
 * The result of this may lead to an infinite loop when the file gets flushed
 * to disk and all delayed extents need to be actually allocated.  To get
 * around this, we explicitly set aside a few blocks which will not be
 * reserved in delayed allocation.
 *
 * For each AG, we need to reserve enough blocks to replenish a totally empty
 * AGFL and 4 more to handle a potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}

/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the AGI free inode and rmap btree root blocks.
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *	- the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOCBT_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_has_finobt(mp))
		blocks++;		/* finobt root block */
	if (xfs_has_rmapbt(mp))
		blocks++;		/* rmap root block */
	if (xfs_has_reflink(mp))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}

static int
xfs_alloc_lookup(
	struct xfs_btree_cur	*cur,
	xfs_lookup_t		dir,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	int			*stat)
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, dir, stat);
	if (*stat == 1)
		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
	else
		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
	return error;
}

/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
static inline int		/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int				/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
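 * An LE lookup is how xfs_alloc_ag_vextent_exact() finds the free extent that
 * would contain args->agbno, and how xfs_free_ag_extent() finds the left
 * neighbor of the extent being freed.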
204 */ 205 int /* error */ 206 xfs_alloc_lookup_le( 207 struct xfs_btree_cur *cur, /* btree cursor */ 208 xfs_agblock_t bno, /* starting block of extent */ 209 xfs_extlen_t len, /* length of extent */ 210 int *stat) /* success/failure */ 211 { 212 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat); 213 } 214 215 static inline bool 216 xfs_alloc_cur_active( 217 struct xfs_btree_cur *cur) 218 { 219 return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE); 220 } 221 222 /* 223 * Update the record referred to by cur to the value given 224 * by [bno, len]. 225 * This either works (return 0) or gets an EFSCORRUPTED error. 226 */ 227 STATIC int /* error */ 228 xfs_alloc_update( 229 struct xfs_btree_cur *cur, /* btree cursor */ 230 xfs_agblock_t bno, /* starting block of extent */ 231 xfs_extlen_t len) /* length of extent */ 232 { 233 union xfs_btree_rec rec; 234 235 rec.alloc.ar_startblock = cpu_to_be32(bno); 236 rec.alloc.ar_blockcount = cpu_to_be32(len); 237 return xfs_btree_update(cur, &rec); 238 } 239 240 /* Convert the ondisk btree record to its incore representation. */ 241 void 242 xfs_alloc_btrec_to_irec( 243 const union xfs_btree_rec *rec, 244 struct xfs_alloc_rec_incore *irec) 245 { 246 irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); 247 irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); 248 } 249 250 /* Simple checks for free space records. */ 251 xfs_failaddr_t 252 xfs_alloc_check_irec( 253 struct xfs_perag *pag, 254 const struct xfs_alloc_rec_incore *irec) 255 { 256 if (irec->ar_blockcount == 0) 257 return __this_address; 258 259 /* check for valid extent range, including overflow */ 260 if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount)) 261 return __this_address; 262 263 return NULL; 264 } 265 266 static inline int 267 xfs_alloc_complain_bad_rec( 268 struct xfs_btree_cur *cur, 269 xfs_failaddr_t fa, 270 const struct xfs_alloc_rec_incore *irec) 271 { 272 struct xfs_mount *mp = cur->bc_mp; 273 274 xfs_warn(mp, 275 "%sbt record corruption in AG %d detected at %pS!", 276 cur->bc_ops->name, cur->bc_group->xg_gno, fa); 277 xfs_warn(mp, 278 "start block 0x%x block count 0x%x", irec->ar_startblock, 279 irec->ar_blockcount); 280 xfs_btree_mark_sick(cur); 281 return -EFSCORRUPTED; 282 } 283 284 /* 285 * Get the data from the pointed-to record. 286 */ 287 int /* error */ 288 xfs_alloc_get_rec( 289 struct xfs_btree_cur *cur, /* btree cursor */ 290 xfs_agblock_t *bno, /* output: starting block of extent */ 291 xfs_extlen_t *len, /* output: length of extent */ 292 int *stat) /* output: success/failure */ 293 { 294 struct xfs_alloc_rec_incore irec; 295 union xfs_btree_rec *rec; 296 xfs_failaddr_t fa; 297 int error; 298 299 error = xfs_btree_get_rec(cur, &rec, stat); 300 if (error || !(*stat)) 301 return error; 302 303 xfs_alloc_btrec_to_irec(rec, &irec); 304 fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec); 305 if (fa) 306 return xfs_alloc_complain_bad_rec(cur, fa, &irec); 307 308 *bno = irec.ar_startblock; 309 *len = irec.ar_blockcount; 310 return 0; 311 } 312 313 /* 314 * Compute aligned version of the found extent. 315 * Takes alignment and min length into account. 
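 * For example, with args->alignment == 8, a free extent found at block 13 is
 * reported as starting at block 16, and its usable length shrinks by the
 * three blocks skipped to reach that alignment.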
316 */ 317 STATIC bool 318 xfs_alloc_compute_aligned( 319 xfs_alloc_arg_t *args, /* allocation argument structure */ 320 xfs_agblock_t foundbno, /* starting block in found extent */ 321 xfs_extlen_t foundlen, /* length in found extent */ 322 xfs_agblock_t *resbno, /* result block number */ 323 xfs_extlen_t *reslen, /* result length */ 324 unsigned *busy_gen) 325 { 326 xfs_agblock_t bno = foundbno; 327 xfs_extlen_t len = foundlen; 328 xfs_extlen_t diff; 329 bool busy; 330 331 /* Trim busy sections out of found extent */ 332 busy = xfs_extent_busy_trim(pag_group(args->pag), args->minlen, 333 args->maxlen, &bno, &len, busy_gen); 334 335 /* 336 * If we have a largish extent that happens to start before min_agbno, 337 * see if we can shift it into range... 338 */ 339 if (bno < args->min_agbno && bno + len > args->min_agbno) { 340 diff = args->min_agbno - bno; 341 if (len > diff) { 342 bno += diff; 343 len -= diff; 344 } 345 } 346 347 if (args->alignment > 1 && len >= args->minlen) { 348 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 349 350 diff = aligned_bno - bno; 351 352 *resbno = aligned_bno; 353 *reslen = diff >= len ? 0 : len - diff; 354 } else { 355 *resbno = bno; 356 *reslen = len; 357 } 358 359 return busy; 360 } 361 362 /* 363 * Compute best start block and diff for "near" allocations. 364 * freelen >= wantlen already checked by caller. 365 */ 366 STATIC xfs_extlen_t /* difference value (absolute) */ 367 xfs_alloc_compute_diff( 368 xfs_agblock_t wantbno, /* target starting block */ 369 xfs_extlen_t wantlen, /* target length */ 370 xfs_extlen_t alignment, /* target alignment */ 371 int datatype, /* are we allocating data? */ 372 xfs_agblock_t freebno, /* freespace's starting block */ 373 xfs_extlen_t freelen, /* freespace's length */ 374 xfs_agblock_t *newbnop) /* result: best start block from free */ 375 { 376 xfs_agblock_t freeend; /* end of freespace extent */ 377 xfs_agblock_t newbno1; /* return block number */ 378 xfs_agblock_t newbno2; /* other new block number */ 379 xfs_extlen_t newlen1=0; /* length with newbno1 */ 380 xfs_extlen_t newlen2=0; /* length with newbno2 */ 381 xfs_agblock_t wantend; /* end of target extent */ 382 bool userdata = datatype & XFS_ALLOC_USERDATA; 383 384 ASSERT(freelen >= wantlen); 385 freeend = freebno + freelen; 386 wantend = wantbno + wantlen; 387 /* 388 * We want to allocate from the start of a free extent if it is past 389 * the desired block or if we are allocating user data and the free 390 * extent is before desired block. The second case is there to allow 391 * for contiguous allocation from the remaining free space if the file 392 * grows in the short term. 
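	 * (Allocating from the start of the free extent leaves the unused
	 * remainder immediately after the new allocation, so a later
	 * extension of the file can be allocated contiguously.)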
393 */ 394 if (freebno >= wantbno || (userdata && freeend < wantend)) { 395 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 396 newbno1 = NULLAGBLOCK; 397 } else if (freeend >= wantend && alignment > 1) { 398 newbno1 = roundup(wantbno, alignment); 399 newbno2 = newbno1 - alignment; 400 if (newbno1 >= freeend) 401 newbno1 = NULLAGBLOCK; 402 else 403 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); 404 if (newbno2 < freebno) 405 newbno2 = NULLAGBLOCK; 406 else 407 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); 408 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { 409 if (newlen1 < newlen2 || 410 (newlen1 == newlen2 && 411 abs_diff(newbno1, wantbno) > 412 abs_diff(newbno2, wantbno))) 413 newbno1 = newbno2; 414 } else if (newbno2 != NULLAGBLOCK) 415 newbno1 = newbno2; 416 } else if (freeend >= wantend) { 417 newbno1 = wantbno; 418 } else if (alignment > 1) { 419 newbno1 = roundup(freeend - wantlen, alignment); 420 if (newbno1 > freeend - wantlen && 421 newbno1 - alignment >= freebno) 422 newbno1 -= alignment; 423 else if (newbno1 >= freeend) 424 newbno1 = NULLAGBLOCK; 425 } else 426 newbno1 = freeend - wantlen; 427 *newbnop = newbno1; 428 return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno); 429 } 430 431 /* 432 * Fix up the length, based on mod and prod. 433 * len should be k * prod + mod for some k. 434 * If len is too small it is returned unchanged. 435 * If len hits maxlen it is left alone. 436 */ 437 STATIC void 438 xfs_alloc_fix_len( 439 xfs_alloc_arg_t *args) /* allocation argument structure */ 440 { 441 xfs_extlen_t k; 442 xfs_extlen_t rlen; 443 444 ASSERT(args->mod < args->prod); 445 rlen = args->len; 446 ASSERT(rlen >= args->minlen); 447 ASSERT(rlen <= args->maxlen); 448 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || 449 (args->mod == 0 && rlen < args->prod)) 450 return; 451 k = rlen % args->prod; 452 if (k == args->mod) 453 return; 454 if (k > args->mod) 455 rlen = rlen - (k - args->mod); 456 else 457 rlen = rlen - args->prod + (args->mod - k); 458 /* casts to (int) catch length underflows */ 459 if ((int)rlen < (int)args->minlen) 460 return; 461 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 462 ASSERT(rlen % args->prod == args->mod); 463 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= 464 rlen + args->minleft); 465 args->len = rlen; 466 } 467 468 /* 469 * Determine if the cursor points to the block that contains the right-most 470 * block of records in the by-count btree. This block contains the largest 471 * contiguous free extent in the AG, so if we modify a record in this block we 472 * need to call xfs_alloc_fixup_longest() once the modifications are done to 473 * ensure the agf->agf_longest field is kept up to date with the longest free 474 * extent tracked by the by-count btree. 475 */ 476 static bool 477 xfs_alloc_cursor_at_lastrec( 478 struct xfs_btree_cur *cnt_cur) 479 { 480 struct xfs_btree_block *block; 481 union xfs_btree_ptr ptr; 482 struct xfs_buf *bp; 483 484 block = xfs_btree_get_block(cnt_cur, 0, &bp); 485 486 xfs_btree_get_sibling(cnt_cur, block, &ptr, XFS_BB_RIGHTSIB); 487 return xfs_btree_ptr_is_null(cnt_cur, &ptr); 488 } 489 490 /* 491 * Find the rightmost record of the cntbt, and return the longest free space 492 * recorded in it. Simply set both the block number and the length to their 493 * maximum values before searching. 
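 * The cntbt orders records by (length, startblock), so an LE lookup against
 * the maximum possible key positions the cursor at the last record, which
 * describes the longest free extent in the AG.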
494 */ 495 static int 496 xfs_cntbt_longest( 497 struct xfs_btree_cur *cnt_cur, 498 xfs_extlen_t *longest) 499 { 500 struct xfs_alloc_rec_incore irec; 501 union xfs_btree_rec *rec; 502 int stat = 0; 503 int error; 504 505 memset(&cnt_cur->bc_rec, 0xFF, sizeof(cnt_cur->bc_rec)); 506 error = xfs_btree_lookup(cnt_cur, XFS_LOOKUP_LE, &stat); 507 if (error) 508 return error; 509 if (!stat) { 510 /* totally empty tree */ 511 *longest = 0; 512 return 0; 513 } 514 515 error = xfs_btree_get_rec(cnt_cur, &rec, &stat); 516 if (error) 517 return error; 518 if (XFS_IS_CORRUPT(cnt_cur->bc_mp, !stat)) { 519 xfs_btree_mark_sick(cnt_cur); 520 return -EFSCORRUPTED; 521 } 522 523 xfs_alloc_btrec_to_irec(rec, &irec); 524 *longest = irec.ar_blockcount; 525 return 0; 526 } 527 528 /* 529 * Update the longest contiguous free extent in the AG from the by-count cursor 530 * that is passed to us. This should be done at the end of any allocation or 531 * freeing operation that touches the longest extent in the btree. 532 * 533 * Needing to update the longest extent can be determined by calling 534 * xfs_alloc_cursor_at_lastrec() after the cursor is positioned for record 535 * modification but before the modification begins. 536 */ 537 static int 538 xfs_alloc_fixup_longest( 539 struct xfs_btree_cur *cnt_cur) 540 { 541 struct xfs_perag *pag = to_perag(cnt_cur->bc_group); 542 struct xfs_buf *bp = cnt_cur->bc_ag.agbp; 543 struct xfs_agf *agf = bp->b_addr; 544 xfs_extlen_t longest = 0; 545 int error; 546 547 /* Lookup last rec in order to update AGF. */ 548 error = xfs_cntbt_longest(cnt_cur, &longest); 549 if (error) 550 return error; 551 552 pag->pagf_longest = longest; 553 agf->agf_longest = cpu_to_be32(pag->pagf_longest); 554 xfs_alloc_log_agf(cnt_cur->bc_tp, bp, XFS_AGF_LONGEST); 555 556 return 0; 557 } 558 559 /* 560 * Update the two btrees, logically removing from freespace the extent 561 * starting at rbno, rlen blocks. The extent is contained within the 562 * actual (current) free extent fbno for flen blocks. 563 * Flags are passed in indicating whether the cursors are set to the 564 * relevant records. 565 */ 566 STATIC int /* error code */ 567 xfs_alloc_fixup_trees( 568 struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ 569 struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ 570 xfs_agblock_t fbno, /* starting block of free extent */ 571 xfs_extlen_t flen, /* length of free extent */ 572 xfs_agblock_t rbno, /* starting block of returned extent */ 573 xfs_extlen_t rlen, /* length of returned extent */ 574 int flags) /* flags, XFSA_FIXUP_... */ 575 { 576 int error; /* error code */ 577 int i; /* operation results */ 578 xfs_agblock_t nfbno1; /* first new free startblock */ 579 xfs_agblock_t nfbno2; /* second new free startblock */ 580 xfs_extlen_t nflen1=0; /* first new free length */ 581 xfs_extlen_t nflen2=0; /* second new free length */ 582 struct xfs_mount *mp; 583 bool fixup_longest = false; 584 585 mp = cnt_cur->bc_mp; 586 587 /* 588 * Look up the record in the by-size tree if necessary. 
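	 * If the caller passed XFSA_FIXUP_CNT_OK, the cursor already points
	 * at [fbno, flen], so outside of DEBUG checking there is nothing to
	 * do here.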
589 */ 590 if (flags & XFSA_FIXUP_CNT_OK) { 591 #ifdef DEBUG 592 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 593 return error; 594 if (XFS_IS_CORRUPT(mp, 595 i != 1 || 596 nfbno1 != fbno || 597 nflen1 != flen)) { 598 xfs_btree_mark_sick(cnt_cur); 599 return -EFSCORRUPTED; 600 } 601 #endif 602 } else { 603 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 604 return error; 605 if (XFS_IS_CORRUPT(mp, i != 1)) { 606 xfs_btree_mark_sick(cnt_cur); 607 return -EFSCORRUPTED; 608 } 609 } 610 /* 611 * Look up the record in the by-block tree if necessary. 612 */ 613 if (flags & XFSA_FIXUP_BNO_OK) { 614 #ifdef DEBUG 615 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 616 return error; 617 if (XFS_IS_CORRUPT(mp, 618 i != 1 || 619 nfbno1 != fbno || 620 nflen1 != flen)) { 621 xfs_btree_mark_sick(bno_cur); 622 return -EFSCORRUPTED; 623 } 624 #endif 625 } else { 626 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 627 return error; 628 if (XFS_IS_CORRUPT(mp, i != 1)) { 629 xfs_btree_mark_sick(bno_cur); 630 return -EFSCORRUPTED; 631 } 632 } 633 634 #ifdef DEBUG 635 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { 636 struct xfs_btree_block *bnoblock; 637 struct xfs_btree_block *cntblock; 638 639 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); 640 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); 641 642 if (XFS_IS_CORRUPT(mp, 643 bnoblock->bb_numrecs != 644 cntblock->bb_numrecs)) { 645 xfs_btree_mark_sick(bno_cur); 646 return -EFSCORRUPTED; 647 } 648 } 649 #endif 650 651 /* 652 * Deal with all four cases: the allocated record is contained 653 * within the freespace record, so we can have new freespace 654 * at either (or both) end, or no freespace remaining. 655 */ 656 if (rbno == fbno && rlen == flen) 657 nfbno1 = nfbno2 = NULLAGBLOCK; 658 else if (rbno == fbno) { 659 nfbno1 = rbno + rlen; 660 nflen1 = flen - rlen; 661 nfbno2 = NULLAGBLOCK; 662 } else if (rbno + rlen == fbno + flen) { 663 nfbno1 = fbno; 664 nflen1 = flen - rlen; 665 nfbno2 = NULLAGBLOCK; 666 } else { 667 nfbno1 = fbno; 668 nflen1 = rbno - fbno; 669 nfbno2 = rbno + rlen; 670 nflen2 = (fbno + flen) - nfbno2; 671 } 672 673 if (xfs_alloc_cursor_at_lastrec(cnt_cur)) 674 fixup_longest = true; 675 676 /* 677 * Delete the entry from the by-size btree. 678 */ 679 if ((error = xfs_btree_delete(cnt_cur, &i))) 680 return error; 681 if (XFS_IS_CORRUPT(mp, i != 1)) { 682 xfs_btree_mark_sick(cnt_cur); 683 return -EFSCORRUPTED; 684 } 685 /* 686 * Add new by-size btree entry(s). 687 */ 688 if (nfbno1 != NULLAGBLOCK) { 689 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 690 return error; 691 if (XFS_IS_CORRUPT(mp, i != 0)) { 692 xfs_btree_mark_sick(cnt_cur); 693 return -EFSCORRUPTED; 694 } 695 if ((error = xfs_btree_insert(cnt_cur, &i))) 696 return error; 697 if (XFS_IS_CORRUPT(mp, i != 1)) { 698 xfs_btree_mark_sick(cnt_cur); 699 return -EFSCORRUPTED; 700 } 701 } 702 if (nfbno2 != NULLAGBLOCK) { 703 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 704 return error; 705 if (XFS_IS_CORRUPT(mp, i != 0)) { 706 xfs_btree_mark_sick(cnt_cur); 707 return -EFSCORRUPTED; 708 } 709 if ((error = xfs_btree_insert(cnt_cur, &i))) 710 return error; 711 if (XFS_IS_CORRUPT(mp, i != 1)) { 712 xfs_btree_mark_sick(cnt_cur); 713 return -EFSCORRUPTED; 714 } 715 } 716 /* 717 * Fix up the by-block btree entry(s). 718 */ 719 if (nfbno1 == NULLAGBLOCK) { 720 /* 721 * No remaining freespace, just delete the by-block tree entry. 
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}

	if (fixup_longest)
		return xfs_alloc_fixup_longest(cnt_cur);

	return 0;
}

/*
 * We do not verify the AGFL contents against AGF-based index counters here,
 * even though we may have access to the perag that contains shadow copies. We
 * don't know if the AGF based counters have been checked, and if they have,
 * they may still be inconsistent because they haven't yet been reset on the
 * first allocation after the AGF has been read in.
 *
 * This means we can only check that all agfl entries contain valid or null
 * values because we can't reliably determine the active range to exclude
 * NULLAGBNO as a valid value.
 *
 * However, we can't even do that for v4 format filesystems because there are
 * old versions of mkfs out there that do not initialise the AGFL to known,
 * verifiable values. Hence we can't tell the difference between an AGFL block
 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
 *
 * As a result, we can only fully validate AGFL block numbers when we pull them
 * from the freelist in xfs_alloc_get_freelist().
 */
static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agfl		*agfl = XFS_BUF_TO_AGFL(bp);
	__be32			*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int			i;

	if (!xfs_has_crc(mp))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * During growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != pag_agno((bp->b_pag)))
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}

static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
827 */ 828 if (!xfs_has_crc(mp)) 829 return; 830 831 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 832 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 833 else { 834 fa = xfs_agfl_verify(bp); 835 if (fa) 836 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 837 } 838 } 839 840 static void 841 xfs_agfl_write_verify( 842 struct xfs_buf *bp) 843 { 844 struct xfs_mount *mp = bp->b_mount; 845 struct xfs_buf_log_item *bip = bp->b_log_item; 846 xfs_failaddr_t fa; 847 848 /* no verification of non-crc AGFLs */ 849 if (!xfs_has_crc(mp)) 850 return; 851 852 fa = xfs_agfl_verify(bp); 853 if (fa) { 854 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 855 return; 856 } 857 858 if (bip) 859 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 860 861 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); 862 } 863 864 const struct xfs_buf_ops xfs_agfl_buf_ops = { 865 .name = "xfs_agfl", 866 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, 867 .verify_read = xfs_agfl_read_verify, 868 .verify_write = xfs_agfl_write_verify, 869 .verify_struct = xfs_agfl_verify, 870 }; 871 872 /* 873 * Read in the allocation group free block array. 874 */ 875 int 876 xfs_alloc_read_agfl( 877 struct xfs_perag *pag, 878 struct xfs_trans *tp, 879 struct xfs_buf **bpp) 880 { 881 struct xfs_mount *mp = pag_mount(pag); 882 struct xfs_buf *bp; 883 int error; 884 885 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 886 XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGFL_DADDR(mp)), 887 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); 888 if (xfs_metadata_is_sick(error)) 889 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); 890 if (error) 891 return error; 892 xfs_buf_set_ref(bp, XFS_AGFL_REF); 893 *bpp = bp; 894 return 0; 895 } 896 897 STATIC int 898 xfs_alloc_update_counters( 899 struct xfs_trans *tp, 900 struct xfs_buf *agbp, 901 long len) 902 { 903 struct xfs_agf *agf = agbp->b_addr; 904 905 agbp->b_pag->pagf_freeblks += len; 906 be32_add_cpu(&agf->agf_freeblks, len); 907 908 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 909 be32_to_cpu(agf->agf_length))) { 910 xfs_buf_mark_corrupt(agbp); 911 xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF); 912 return -EFSCORRUPTED; 913 } 914 915 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 916 return 0; 917 } 918 919 /* 920 * Block allocation algorithm and data structures. 921 */ 922 struct xfs_alloc_cur { 923 struct xfs_btree_cur *cnt; /* btree cursors */ 924 struct xfs_btree_cur *bnolt; 925 struct xfs_btree_cur *bnogt; 926 xfs_extlen_t cur_len;/* current search length */ 927 xfs_agblock_t rec_bno;/* extent startblock */ 928 xfs_extlen_t rec_len;/* extent length */ 929 xfs_agblock_t bno; /* alloc bno */ 930 xfs_extlen_t len; /* alloc len */ 931 xfs_extlen_t diff; /* diff from search bno */ 932 unsigned int busy_gen;/* busy state */ 933 bool busy; 934 }; 935 936 /* 937 * Set up cursors, etc. in the extent allocation cursor. This function can be 938 * called multiple times to reset an initialized structure without having to 939 * reallocate cursors. 940 */ 941 static int 942 xfs_alloc_cur_setup( 943 struct xfs_alloc_arg *args, 944 struct xfs_alloc_cur *acur) 945 { 946 int error; 947 int i; 948 949 acur->cur_len = args->maxlen; 950 acur->rec_bno = 0; 951 acur->rec_len = 0; 952 acur->bno = 0; 953 acur->len = 0; 954 acur->diff = -1; 955 acur->busy = false; 956 acur->busy_gen = 0; 957 958 /* 959 * Perform an initial cntbt lookup to check for availability of maxlen 960 * extents. If this fails, we'll return -ENOSPC to signal the caller to 961 * attempt a small allocation. 
	 */
	if (!acur->cnt)
		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	if (!acur->bnogt)
		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation mode.
	 */
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
1072 */ 1073 if (diff > acur->diff) { 1074 deactivate = isbnobt; 1075 goto out; 1076 } 1077 1078 ASSERT(args->len > acur->len || 1079 (args->len == acur->len && diff <= acur->diff)); 1080 acur->rec_bno = bno; 1081 acur->rec_len = len; 1082 acur->bno = bnew; 1083 acur->len = args->len; 1084 acur->diff = diff; 1085 *new = 1; 1086 1087 /* 1088 * We're done if we found a perfect allocation. This only deactivates 1089 * the current cursor, but this is just an optimization to terminate a 1090 * cntbt search that otherwise runs to the edge of the tree. 1091 */ 1092 if (acur->diff == 0 && acur->len == args->maxlen) 1093 deactivate = true; 1094 out: 1095 if (deactivate) 1096 cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; 1097 trace_xfs_alloc_cur_check(cur, bno, len, diff, *new); 1098 return 0; 1099 } 1100 1101 /* 1102 * Complete an allocation of a candidate extent. Remove the extent from both 1103 * trees and update the args structure. 1104 */ 1105 STATIC int 1106 xfs_alloc_cur_finish( 1107 struct xfs_alloc_arg *args, 1108 struct xfs_alloc_cur *acur) 1109 { 1110 int error; 1111 1112 ASSERT(acur->cnt && acur->bnolt); 1113 ASSERT(acur->bno >= acur->rec_bno); 1114 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); 1115 ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); 1116 1117 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, 1118 acur->rec_len, acur->bno, acur->len, 0); 1119 if (error) 1120 return error; 1121 1122 args->agbno = acur->bno; 1123 args->len = acur->len; 1124 args->wasfromfl = 0; 1125 1126 trace_xfs_alloc_cur(args); 1127 return 0; 1128 } 1129 1130 /* 1131 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses 1132 * bno optimized lookup to search for extents with ideal size and locality. 1133 */ 1134 STATIC int 1135 xfs_alloc_cntbt_iter( 1136 struct xfs_alloc_arg *args, 1137 struct xfs_alloc_cur *acur) 1138 { 1139 struct xfs_btree_cur *cur = acur->cnt; 1140 xfs_agblock_t bno; 1141 xfs_extlen_t len, cur_len; 1142 int error; 1143 int i; 1144 1145 if (!xfs_alloc_cur_active(cur)) 1146 return 0; 1147 1148 /* locality optimized lookup */ 1149 cur_len = acur->cur_len; 1150 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); 1151 if (error) 1152 return error; 1153 if (i == 0) 1154 return 0; 1155 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 1156 if (error) 1157 return error; 1158 1159 /* check the current record and update search length from it */ 1160 error = xfs_alloc_cur_check(args, acur, cur, &i); 1161 if (error) 1162 return error; 1163 ASSERT(len >= acur->cur_len); 1164 acur->cur_len = len; 1165 1166 /* 1167 * We looked up the first record >= [agbno, len] above. The agbno is a 1168 * secondary key and so the current record may lie just before or after 1169 * agbno. If it is past agbno, check the previous record too so long as 1170 * the length matches as it may be closer. Don't check a smaller record 1171 * because that could deactivate our cursor. 1172 */ 1173 if (bno > args->agbno) { 1174 error = xfs_btree_decrement(cur, 0, &i); 1175 if (!error && i) { 1176 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 1177 if (!error && i && len == acur->cur_len) 1178 error = xfs_alloc_cur_check(args, acur, cur, 1179 &i); 1180 } 1181 if (error) 1182 return error; 1183 } 1184 1185 /* 1186 * Increment the search key until we find at least one allocation 1187 * candidate or if the extent we found was larger. Otherwise, double the 1188 * search key to optimize the search. 
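	 * (If we have no candidate yet, or the record we just checked already
	 * pushed the key to at least double its previous value, step the key
	 * up by one; otherwise jump it straight to double the previous value.)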
Efficiency is more important here 1189 * than absolute best locality. 1190 */ 1191 cur_len <<= 1; 1192 if (!acur->len || acur->cur_len >= cur_len) 1193 acur->cur_len++; 1194 else 1195 acur->cur_len = cur_len; 1196 1197 return error; 1198 } 1199 1200 /* 1201 * Deal with the case where only small freespaces remain. Either return the 1202 * contents of the last freespace record, or allocate space from the freelist if 1203 * there is nothing in the tree. 1204 */ 1205 STATIC int /* error */ 1206 xfs_alloc_ag_vextent_small( 1207 struct xfs_alloc_arg *args, /* allocation argument structure */ 1208 struct xfs_btree_cur *ccur, /* optional by-size cursor */ 1209 xfs_agblock_t *fbnop, /* result block number */ 1210 xfs_extlen_t *flenp, /* result length */ 1211 int *stat) /* status: 0-freelist, 1-normal/none */ 1212 { 1213 struct xfs_agf *agf = args->agbp->b_addr; 1214 int error = 0; 1215 xfs_agblock_t fbno = NULLAGBLOCK; 1216 xfs_extlen_t flen = 0; 1217 int i = 0; 1218 1219 /* 1220 * If a cntbt cursor is provided, try to allocate the largest record in 1221 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the 1222 * allocation. Make sure to respect minleft even when pulling from the 1223 * freelist. 1224 */ 1225 if (ccur) 1226 error = xfs_btree_decrement(ccur, 0, &i); 1227 if (error) 1228 goto error; 1229 if (i) { 1230 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); 1231 if (error) 1232 goto error; 1233 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1234 xfs_btree_mark_sick(ccur); 1235 error = -EFSCORRUPTED; 1236 goto error; 1237 } 1238 goto out; 1239 } 1240 1241 if (args->minlen != 1 || args->alignment != 1 || 1242 args->resv == XFS_AG_RESV_AGFL || 1243 be32_to_cpu(agf->agf_flcount) <= args->minleft) 1244 goto out; 1245 1246 error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, 1247 &fbno, 0); 1248 if (error) 1249 goto error; 1250 if (fbno == NULLAGBLOCK) 1251 goto out; 1252 1253 xfs_extent_busy_reuse(pag_group(args->pag), fbno, 1, 1254 (args->datatype & XFS_ALLOC_NOBUSY)); 1255 1256 if (args->datatype & XFS_ALLOC_USERDATA) { 1257 struct xfs_buf *bp; 1258 1259 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, 1260 xfs_agbno_to_daddr(args->pag, fbno), 1261 args->mp->m_bsize, 0, &bp); 1262 if (error) 1263 goto error; 1264 xfs_trans_binval(args->tp, bp); 1265 } 1266 *fbnop = args->agbno = fbno; 1267 *flenp = args->len = 1; 1268 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { 1269 xfs_btree_mark_sick(ccur); 1270 error = -EFSCORRUPTED; 1271 goto error; 1272 } 1273 args->wasfromfl = 1; 1274 trace_xfs_alloc_small_freelist(args); 1275 1276 /* 1277 * If we're feeding an AGFL block to something that doesn't live in the 1278 * free space, we need to clear out the OWN_AG rmap. 1279 */ 1280 error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, 1281 &XFS_RMAP_OINFO_AG); 1282 if (error) 1283 goto error; 1284 1285 *stat = 0; 1286 return 0; 1287 1288 out: 1289 /* 1290 * Can't do the allocation, give up. 1291 */ 1292 if (flen < args->minlen) { 1293 args->agbno = NULLAGBLOCK; 1294 trace_xfs_alloc_small_notenough(args); 1295 flen = 0; 1296 } 1297 *fbnop = fbno; 1298 *flenp = flen; 1299 *stat = 1; 1300 trace_xfs_alloc_small_done(args); 1301 return 0; 1302 1303 error: 1304 trace_xfs_alloc_small_error(args); 1305 return error; 1306 } 1307 1308 /* 1309 * Allocate a variable extent at exactly agno/bno. 1310 * Extent's length (returned in *len) will be between minlen and maxlen, 1311 * and of the form k * prod + mod unless there's nothing that large. 
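 * (For example, with prod = 4 and mod = 1, xfs_alloc_fix_len() trims the
 * length down to the nearest value congruent to 1 modulo 4, unless that would
 * drop it below minlen, in which case the length is left alone.)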
1312 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. 1313 */ 1314 STATIC int /* error */ 1315 xfs_alloc_ag_vextent_exact( 1316 xfs_alloc_arg_t *args) /* allocation argument structure */ 1317 { 1318 struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ 1319 struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ 1320 int error; 1321 xfs_agblock_t fbno; /* start block of found extent */ 1322 xfs_extlen_t flen; /* length of found extent */ 1323 xfs_agblock_t tbno; /* start block of busy extent */ 1324 xfs_extlen_t tlen; /* length of busy extent */ 1325 xfs_agblock_t tend; /* end block of busy extent */ 1326 int i; /* success/failure of operation */ 1327 unsigned busy_gen; 1328 1329 ASSERT(args->alignment == 1); 1330 1331 /* 1332 * Allocate/initialize a cursor for the by-number freespace btree. 1333 */ 1334 bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, 1335 args->pag); 1336 1337 /* 1338 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 1339 * Look for the closest free block <= bno, it must contain bno 1340 * if any free block does. 1341 */ 1342 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); 1343 if (error) 1344 goto error0; 1345 if (!i) 1346 goto not_found; 1347 1348 /* 1349 * Grab the freespace record. 1350 */ 1351 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 1352 if (error) 1353 goto error0; 1354 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1355 xfs_btree_mark_sick(bno_cur); 1356 error = -EFSCORRUPTED; 1357 goto error0; 1358 } 1359 ASSERT(fbno <= args->agbno); 1360 1361 /* 1362 * Check for overlapping busy extents. 1363 */ 1364 tbno = fbno; 1365 tlen = flen; 1366 xfs_extent_busy_trim(pag_group(args->pag), args->minlen, args->maxlen, 1367 &tbno, &tlen, &busy_gen); 1368 1369 /* 1370 * Give up if the start of the extent is busy, or the freespace isn't 1371 * long enough for the minimum request. 1372 */ 1373 if (tbno > args->agbno) 1374 goto not_found; 1375 if (tlen < args->minlen) 1376 goto not_found; 1377 tend = tbno + tlen; 1378 if (tend < args->agbno + args->minlen) 1379 goto not_found; 1380 1381 /* 1382 * End of extent will be smaller of the freespace end and the 1383 * maximal requested end. 1384 * 1385 * Fix the length according to mod and prod if given. 1386 */ 1387 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) 1388 - args->agbno; 1389 xfs_alloc_fix_len(args); 1390 ASSERT(args->agbno + args->len <= tend); 1391 1392 /* 1393 * We are allocating agbno for args->len 1394 * Allocate/initialize a cursor for the by-size btree. 1395 */ 1396 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, 1397 args->pag); 1398 ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); 1399 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, 1400 args->len, XFSA_FIXUP_BNO_OK); 1401 if (error) { 1402 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1403 goto error0; 1404 } 1405 1406 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1407 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1408 1409 args->wasfromfl = 0; 1410 trace_xfs_alloc_exact_done(args); 1411 return 0; 1412 1413 not_found: 1414 /* Didn't find it, return null. 
*/ 1415 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1416 args->agbno = NULLAGBLOCK; 1417 trace_xfs_alloc_exact_notfound(args); 1418 return 0; 1419 1420 error0: 1421 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1422 trace_xfs_alloc_exact_error(args); 1423 return error; 1424 } 1425 1426 /* 1427 * Search a given number of btree records in a given direction. Check each 1428 * record against the good extent we've already found. 1429 */ 1430 STATIC int 1431 xfs_alloc_walk_iter( 1432 struct xfs_alloc_arg *args, 1433 struct xfs_alloc_cur *acur, 1434 struct xfs_btree_cur *cur, 1435 bool increment, 1436 bool find_one, /* quit on first candidate */ 1437 int count, /* rec count (-1 for infinite) */ 1438 int *stat) 1439 { 1440 int error; 1441 int i; 1442 1443 *stat = 0; 1444 1445 /* 1446 * Search so long as the cursor is active or we find a better extent. 1447 * The cursor is deactivated if it extends beyond the range of the 1448 * current allocation candidate. 1449 */ 1450 while (xfs_alloc_cur_active(cur) && count) { 1451 error = xfs_alloc_cur_check(args, acur, cur, &i); 1452 if (error) 1453 return error; 1454 if (i == 1) { 1455 *stat = 1; 1456 if (find_one) 1457 break; 1458 } 1459 if (!xfs_alloc_cur_active(cur)) 1460 break; 1461 1462 if (increment) 1463 error = xfs_btree_increment(cur, 0, &i); 1464 else 1465 error = xfs_btree_decrement(cur, 0, &i); 1466 if (error) 1467 return error; 1468 if (i == 0) 1469 cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; 1470 1471 if (count > 0) 1472 count--; 1473 } 1474 1475 return 0; 1476 } 1477 1478 /* 1479 * Search the by-bno and by-size btrees in parallel in search of an extent with 1480 * ideal locality based on the NEAR mode ->agbno locality hint. 1481 */ 1482 STATIC int 1483 xfs_alloc_ag_vextent_locality( 1484 struct xfs_alloc_arg *args, 1485 struct xfs_alloc_cur *acur, 1486 int *stat) 1487 { 1488 struct xfs_btree_cur *fbcur = NULL; 1489 int error; 1490 int i; 1491 bool fbinc; 1492 1493 ASSERT(acur->len == 0); 1494 1495 *stat = 0; 1496 1497 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); 1498 if (error) 1499 return error; 1500 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); 1501 if (error) 1502 return error; 1503 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); 1504 if (error) 1505 return error; 1506 1507 /* 1508 * Search the bnobt and cntbt in parallel. Search the bnobt left and 1509 * right and lookup the closest extent to the locality hint for each 1510 * extent size key in the cntbt. The entire search terminates 1511 * immediately on a bnobt hit because that means we've found best case 1512 * locality. Otherwise the search continues until the cntbt cursor runs 1513 * off the end of the tree. If no allocation candidate is found at this 1514 * point, give up on locality, walk backwards from the end of the cntbt 1515 * and take the first available extent. 1516 * 1517 * The parallel tree searches balance each other out to provide fairly 1518 * consistent performance for various situations. The bnobt search can 1519 * have pathological behavior in the worst case scenario of larger 1520 * allocation requests and fragmented free space. On the other hand, the 1521 * bnobt is able to satisfy most smaller allocation requests much more 1522 * quickly than the cntbt. The cntbt search can sift through fragmented 1523 * free space and sets of free extents for larger allocation requests 1524 * more quickly than the bnobt. 
Since the locality hint is just a hint 1525 * and we don't want to scan the entire bnobt for perfect locality, the 1526 * cntbt search essentially bounds the bnobt search such that we can 1527 * find good enough locality at reasonable performance in most cases. 1528 */ 1529 while (xfs_alloc_cur_active(acur->bnolt) || 1530 xfs_alloc_cur_active(acur->bnogt) || 1531 xfs_alloc_cur_active(acur->cnt)) { 1532 1533 trace_xfs_alloc_cur_lookup(args); 1534 1535 /* 1536 * Search the bnobt left and right. In the case of a hit, finish 1537 * the search in the opposite direction and we're done. 1538 */ 1539 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, 1540 true, 1, &i); 1541 if (error) 1542 return error; 1543 if (i == 1) { 1544 trace_xfs_alloc_cur_left(args); 1545 fbcur = acur->bnogt; 1546 fbinc = true; 1547 break; 1548 } 1549 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, 1550 1, &i); 1551 if (error) 1552 return error; 1553 if (i == 1) { 1554 trace_xfs_alloc_cur_right(args); 1555 fbcur = acur->bnolt; 1556 fbinc = false; 1557 break; 1558 } 1559 1560 /* 1561 * Check the extent with best locality based on the current 1562 * extent size search key and keep track of the best candidate. 1563 */ 1564 error = xfs_alloc_cntbt_iter(args, acur); 1565 if (error) 1566 return error; 1567 if (!xfs_alloc_cur_active(acur->cnt)) { 1568 trace_xfs_alloc_cur_lookup_done(args); 1569 break; 1570 } 1571 } 1572 1573 /* 1574 * If we failed to find anything due to busy extents, return empty 1575 * handed so the caller can flush and retry. If no busy extents were 1576 * found, walk backwards from the end of the cntbt as a last resort. 1577 */ 1578 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { 1579 error = xfs_btree_decrement(acur->cnt, 0, &i); 1580 if (error) 1581 return error; 1582 if (i) { 1583 acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; 1584 fbcur = acur->cnt; 1585 fbinc = false; 1586 } 1587 } 1588 1589 /* 1590 * Search in the opposite direction for a better entry in the case of 1591 * a bnobt hit or walk backwards from the end of the cntbt. 1592 */ 1593 if (fbcur) { 1594 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, 1595 &i); 1596 if (error) 1597 return error; 1598 } 1599 1600 if (acur->len) 1601 *stat = 1; 1602 1603 return 0; 1604 } 1605 1606 /* Check the last block of the cnt btree for allocations. */ 1607 static int 1608 xfs_alloc_ag_vextent_lastblock( 1609 struct xfs_alloc_arg *args, 1610 struct xfs_alloc_cur *acur, 1611 xfs_agblock_t *bno, 1612 xfs_extlen_t *len, 1613 bool *allocated) 1614 { 1615 int error; 1616 int i; 1617 1618 #ifdef DEBUG 1619 /* Randomly don't execute the first algorithm. */ 1620 if (get_random_u32_below(2)) 1621 return 0; 1622 #endif 1623 1624 /* 1625 * Start from the entry that lookup found, sequence through all larger 1626 * free blocks. If we're actually pointing at a record smaller than 1627 * maxlen, go to the start of this block, and skip all those smaller 1628 * than minlen. 
1629 */ 1630 if (*len || args->alignment > 1) { 1631 acur->cnt->bc_levels[0].ptr = 1; 1632 do { 1633 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); 1634 if (error) 1635 return error; 1636 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1637 xfs_btree_mark_sick(acur->cnt); 1638 return -EFSCORRUPTED; 1639 } 1640 if (*len >= args->minlen) 1641 break; 1642 error = xfs_btree_increment(acur->cnt, 0, &i); 1643 if (error) 1644 return error; 1645 } while (i); 1646 ASSERT(*len >= args->minlen); 1647 if (!i) 1648 return 0; 1649 } 1650 1651 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); 1652 if (error) 1653 return error; 1654 1655 /* 1656 * It didn't work. We COULD be in a case where there's a good record 1657 * somewhere, so try again. 1658 */ 1659 if (acur->len == 0) 1660 return 0; 1661 1662 trace_xfs_alloc_near_first(args); 1663 *allocated = true; 1664 return 0; 1665 } 1666 1667 /* 1668 * Allocate a variable extent near bno in the allocation group agno. 1669 * Extent's length (returned in len) will be between minlen and maxlen, 1670 * and of the form k * prod + mod unless there's nothing that large. 1671 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1672 */ 1673 STATIC int 1674 xfs_alloc_ag_vextent_near( 1675 struct xfs_alloc_arg *args, 1676 uint32_t alloc_flags) 1677 { 1678 struct xfs_alloc_cur acur = {}; 1679 int error; /* error code */ 1680 int i; /* result code, temporary */ 1681 xfs_agblock_t bno; 1682 xfs_extlen_t len; 1683 1684 /* handle uninitialized agbno range so caller doesn't have to */ 1685 if (!args->min_agbno && !args->max_agbno) 1686 args->max_agbno = args->mp->m_sb.sb_agblocks - 1; 1687 ASSERT(args->min_agbno <= args->max_agbno); 1688 1689 /* clamp agbno to the range if it's outside */ 1690 if (args->agbno < args->min_agbno) 1691 args->agbno = args->min_agbno; 1692 if (args->agbno > args->max_agbno) 1693 args->agbno = args->max_agbno; 1694 1695 /* Retry once quickly if we find busy extents before blocking. */ 1696 alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; 1697 restart: 1698 len = 0; 1699 1700 /* 1701 * Set up cursors and see if there are any free extents as big as 1702 * maxlen. If not, pick the last entry in the tree unless the tree is 1703 * empty. 1704 */ 1705 error = xfs_alloc_cur_setup(args, &acur); 1706 if (error == -ENOSPC) { 1707 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, 1708 &len, &i); 1709 if (error) 1710 goto out; 1711 if (i == 0 || len == 0) { 1712 trace_xfs_alloc_near_noentry(args); 1713 goto out; 1714 } 1715 ASSERT(i == 1); 1716 } else if (error) { 1717 goto out; 1718 } 1719 1720 /* 1721 * First algorithm. 1722 * If the requested extent is large wrt the freespaces available 1723 * in this a.g., then the cursor will be pointing to a btree entry 1724 * near the right edge of the tree. If it's in the last btree leaf 1725 * block, then we just examine all the entries in that block 1726 * that are big enough, and pick the best one. 1727 */ 1728 if (xfs_btree_islastblock(acur.cnt, 0)) { 1729 bool allocated = false; 1730 1731 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, 1732 &allocated); 1733 if (error) 1734 goto out; 1735 if (allocated) 1736 goto alloc_finish; 1737 } 1738 1739 /* 1740 * Second algorithm. Combined cntbt and bnobt search to find ideal 1741 * locality. 1742 */ 1743 error = xfs_alloc_ag_vextent_locality(args, &acur, &i); 1744 if (error) 1745 goto out; 1746 1747 /* 1748 * If we couldn't get anything, give up. 
1749 */ 1750 if (!acur.len) { 1751 if (acur.busy) { 1752 /* 1753 * Our only valid extents must have been busy. Flush and 1754 * retry the allocation again. If we get an -EAGAIN 1755 * error, we're being told that a deadlock was avoided 1756 * and the current transaction needs committing before 1757 * the allocation can be retried. 1758 */ 1759 trace_xfs_alloc_near_busy(args); 1760 error = xfs_extent_busy_flush(args->tp, 1761 pag_group(args->pag), acur.busy_gen, 1762 alloc_flags); 1763 if (error) 1764 goto out; 1765 1766 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; 1767 goto restart; 1768 } 1769 trace_xfs_alloc_size_neither(args); 1770 args->agbno = NULLAGBLOCK; 1771 goto out; 1772 } 1773 1774 alloc_finish: 1775 /* fix up btrees on a successful allocation */ 1776 error = xfs_alloc_cur_finish(args, &acur); 1777 1778 out: 1779 xfs_alloc_cur_close(&acur, error); 1780 return error; 1781 } 1782 1783 /* 1784 * Allocate a variable extent anywhere in the allocation group agno. 1785 * Extent's length (returned in len) will be between minlen and maxlen, 1786 * and of the form k * prod + mod unless there's nothing that large. 1787 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1788 */ 1789 static int 1790 xfs_alloc_ag_vextent_size( 1791 struct xfs_alloc_arg *args, 1792 uint32_t alloc_flags) 1793 { 1794 struct xfs_agf *agf = args->agbp->b_addr; 1795 struct xfs_btree_cur *bno_cur; 1796 struct xfs_btree_cur *cnt_cur; 1797 xfs_agblock_t fbno; /* start of found freespace */ 1798 xfs_extlen_t flen; /* length of found freespace */ 1799 xfs_agblock_t rbno; /* returned block number */ 1800 xfs_extlen_t rlen; /* length of returned extent */ 1801 bool busy; 1802 unsigned busy_gen; 1803 int error; 1804 int i; 1805 1806 /* Retry once quickly if we find busy extents before blocking. */ 1807 alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; 1808 restart: 1809 /* 1810 * Allocate and initialize a cursor for the by-size btree. 1811 */ 1812 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, 1813 args->pag); 1814 bno_cur = NULL; 1815 1816 /* 1817 * Look for an entry >= maxlen+alignment-1 blocks. 1818 */ 1819 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1820 args->maxlen + args->alignment - 1, &i))) 1821 goto error0; 1822 1823 /* 1824 * If none then we have to settle for a smaller extent. In the case that 1825 * there are no large extents, this will return the last entry in the 1826 * tree unless the tree is empty. In the case that there are only busy 1827 * large extents, this will return the largest small extent unless there 1828 * are no smaller extents available. 1829 */ 1830 if (!i) { 1831 error = xfs_alloc_ag_vextent_small(args, cnt_cur, 1832 &fbno, &flen, &i); 1833 if (error) 1834 goto error0; 1835 if (i == 0 || flen == 0) { 1836 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1837 trace_xfs_alloc_size_noentry(args); 1838 return 0; 1839 } 1840 ASSERT(i == 1); 1841 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, 1842 &rlen, &busy_gen); 1843 } else { 1844 /* 1845 * Search for a non-busy extent that is large enough. 
1846 */ 1847 for (;;) { 1848 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1849 if (error) 1850 goto error0; 1851 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1852 xfs_btree_mark_sick(cnt_cur); 1853 error = -EFSCORRUPTED; 1854 goto error0; 1855 } 1856 1857 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1858 &rbno, &rlen, &busy_gen); 1859 1860 if (rlen >= args->maxlen) 1861 break; 1862 1863 error = xfs_btree_increment(cnt_cur, 0, &i); 1864 if (error) 1865 goto error0; 1866 if (i) 1867 continue; 1868 1869 /* 1870 * Our only valid extents must have been busy. Flush and 1871 * retry the allocation again. If we get an -EAGAIN 1872 * error, we're being told that a deadlock was avoided 1873 * and the current transaction needs committing before 1874 * the allocation can be retried. 1875 */ 1876 trace_xfs_alloc_size_busy(args); 1877 error = xfs_extent_busy_flush(args->tp, 1878 pag_group(args->pag), busy_gen, 1879 alloc_flags); 1880 if (error) 1881 goto error0; 1882 1883 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; 1884 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1885 goto restart; 1886 } 1887 } 1888 1889 /* 1890 * In the first case above, we got the last entry in the 1891 * by-size btree. Now we check to see if the space hits maxlen 1892 * once aligned; if not, we search left for something better. 1893 * This can't happen in the second case above. 1894 */ 1895 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1896 if (XFS_IS_CORRUPT(args->mp, 1897 rlen != 0 && 1898 (rlen > flen || 1899 rbno + rlen > fbno + flen))) { 1900 xfs_btree_mark_sick(cnt_cur); 1901 error = -EFSCORRUPTED; 1902 goto error0; 1903 } 1904 if (rlen < args->maxlen) { 1905 xfs_agblock_t bestfbno; 1906 xfs_extlen_t bestflen; 1907 xfs_agblock_t bestrbno; 1908 xfs_extlen_t bestrlen; 1909 1910 bestrlen = rlen; 1911 bestrbno = rbno; 1912 bestflen = flen; 1913 bestfbno = fbno; 1914 for (;;) { 1915 if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) 1916 goto error0; 1917 if (i == 0) 1918 break; 1919 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1920 &i))) 1921 goto error0; 1922 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1923 xfs_btree_mark_sick(cnt_cur); 1924 error = -EFSCORRUPTED; 1925 goto error0; 1926 } 1927 if (flen <= bestrlen) 1928 break; 1929 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1930 &rbno, &rlen, &busy_gen); 1931 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1932 if (XFS_IS_CORRUPT(args->mp, 1933 rlen != 0 && 1934 (rlen > flen || 1935 rbno + rlen > fbno + flen))) { 1936 xfs_btree_mark_sick(cnt_cur); 1937 error = -EFSCORRUPTED; 1938 goto error0; 1939 } 1940 if (rlen > bestrlen) { 1941 bestrlen = rlen; 1942 bestrbno = rbno; 1943 bestflen = flen; 1944 bestfbno = fbno; 1945 if (rlen == args->maxlen) 1946 break; 1947 } 1948 } 1949 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1950 &i))) 1951 goto error0; 1952 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1953 xfs_btree_mark_sick(cnt_cur); 1954 error = -EFSCORRUPTED; 1955 goto error0; 1956 } 1957 rlen = bestrlen; 1958 rbno = bestrbno; 1959 flen = bestflen; 1960 fbno = bestfbno; 1961 } 1962 args->wasfromfl = 0; 1963 /* 1964 * Fix up the length. 1965 */ 1966 args->len = rlen; 1967 if (rlen < args->minlen) { 1968 if (busy) { 1969 /* 1970 * Our only valid extents must have been busy. Flush and 1971 * retry the allocation again. If we get an -EAGAIN 1972 * error, we're being told that a deadlock was avoided 1973 * and the current transaction needs committing before 1974 * the allocation can be retried. 
1975 */ 1976 trace_xfs_alloc_size_busy(args); 1977 error = xfs_extent_busy_flush(args->tp, 1978 pag_group(args->pag), busy_gen, 1979 alloc_flags); 1980 if (error) 1981 goto error0; 1982 1983 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; 1984 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1985 goto restart; 1986 } 1987 goto out_nominleft; 1988 } 1989 xfs_alloc_fix_len(args); 1990 1991 rlen = args->len; 1992 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { 1993 xfs_btree_mark_sick(cnt_cur); 1994 error = -EFSCORRUPTED; 1995 goto error0; 1996 } 1997 /* 1998 * Allocate and initialize a cursor for the by-block tree. 1999 */ 2000 bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, 2001 args->pag); 2002 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 2003 rbno, rlen, XFSA_FIXUP_CNT_OK))) 2004 goto error0; 2005 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2006 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2007 cnt_cur = bno_cur = NULL; 2008 args->len = rlen; 2009 args->agbno = rbno; 2010 if (XFS_IS_CORRUPT(args->mp, 2011 args->agbno + args->len > 2012 be32_to_cpu(agf->agf_length))) { 2013 xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT); 2014 error = -EFSCORRUPTED; 2015 goto error0; 2016 } 2017 trace_xfs_alloc_size_done(args); 2018 return 0; 2019 2020 error0: 2021 trace_xfs_alloc_size_error(args); 2022 if (cnt_cur) 2023 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2024 if (bno_cur) 2025 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2026 return error; 2027 2028 out_nominleft: 2029 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2030 trace_xfs_alloc_size_nominleft(args); 2031 args->agbno = NULLAGBLOCK; 2032 return 0; 2033 } 2034 2035 /* 2036 * Free the extent starting at agno/bno for length. 2037 */ 2038 int 2039 xfs_free_ag_extent( 2040 struct xfs_trans *tp, 2041 struct xfs_buf *agbp, 2042 xfs_agblock_t bno, 2043 xfs_extlen_t len, 2044 const struct xfs_owner_info *oinfo, 2045 enum xfs_ag_resv_type type) 2046 { 2047 struct xfs_mount *mp; 2048 struct xfs_btree_cur *bno_cur; 2049 struct xfs_btree_cur *cnt_cur; 2050 xfs_agblock_t gtbno; /* start of right neighbor */ 2051 xfs_extlen_t gtlen; /* length of right neighbor */ 2052 xfs_agblock_t ltbno; /* start of left neighbor */ 2053 xfs_extlen_t ltlen; /* length of left neighbor */ 2054 xfs_agblock_t nbno; /* new starting block of freesp */ 2055 xfs_extlen_t nlen; /* new length of freespace */ 2056 int haveleft; /* have a left neighbor */ 2057 int haveright; /* have a right neighbor */ 2058 int i; 2059 int error; 2060 struct xfs_perag *pag = agbp->b_pag; 2061 bool fixup_longest = false; 2062 2063 bno_cur = cnt_cur = NULL; 2064 mp = tp->t_mountp; 2065 2066 if (!xfs_rmap_should_skip_owner_update(oinfo)) { 2067 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); 2068 if (error) 2069 goto error0; 2070 } 2071 2072 /* 2073 * Allocate and initialize a cursor for the by-block btree. 2074 */ 2075 bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); 2076 /* 2077 * Look for a neighboring block on the left (lower block numbers) 2078 * that is contiguous with this space. 2079 */ 2080 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) 2081 goto error0; 2082 if (haveleft) { 2083 /* 2084 * There is a block to our left. 2085 */ 2086 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) 2087 goto error0; 2088 if (XFS_IS_CORRUPT(mp, i != 1)) { 2089 xfs_btree_mark_sick(bno_cur); 2090 error = -EFSCORRUPTED; 2091 goto error0; 2092 } 2093 /* 2094 * It's not contiguous, though.
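 * For example, freeing [bno = 100, len = 10] with a left record of
 * [90, 5]: 90 + 5 = 95 < 100, so there is a gap and no merge is done.
 * A left record of [90, 10] touches the freed range exactly and will be
 * merged, while anything larger (say [90, 12]) overlaps space that is
 * supposedly still allocated and is caught as corruption below.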
2095 */ 2096 if (ltbno + ltlen < bno) 2097 haveleft = 0; 2098 else { 2099 /* 2100 * If this failure happens the request to free this 2101 * space was invalid, it's (partly) already free. 2102 * Very bad. 2103 */ 2104 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { 2105 xfs_btree_mark_sick(bno_cur); 2106 error = -EFSCORRUPTED; 2107 goto error0; 2108 } 2109 } 2110 } 2111 /* 2112 * Look for a neighboring block on the right (higher block numbers) 2113 * that is contiguous with this space. 2114 */ 2115 if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) 2116 goto error0; 2117 if (haveright) { 2118 /* 2119 * There is a block to our right. 2120 */ 2121 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) 2122 goto error0; 2123 if (XFS_IS_CORRUPT(mp, i != 1)) { 2124 xfs_btree_mark_sick(bno_cur); 2125 error = -EFSCORRUPTED; 2126 goto error0; 2127 } 2128 /* 2129 * It's not contiguous, though. 2130 */ 2131 if (bno + len < gtbno) 2132 haveright = 0; 2133 else { 2134 /* 2135 * If this failure happens the request to free this 2136 * space was invalid, it's (partly) already free. 2137 * Very bad. 2138 */ 2139 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 2140 xfs_btree_mark_sick(bno_cur); 2141 error = -EFSCORRUPTED; 2142 goto error0; 2143 } 2144 } 2145 } 2146 /* 2147 * Now allocate and initialize a cursor for the by-size tree. 2148 */ 2149 cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); 2150 /* 2151 * Have both left and right contiguous neighbors. 2152 * Merge all three into a single free block. 2153 */ 2154 if (haveleft && haveright) { 2155 /* 2156 * Delete the old by-size entry on the left. 2157 */ 2158 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2159 goto error0; 2160 if (XFS_IS_CORRUPT(mp, i != 1)) { 2161 xfs_btree_mark_sick(cnt_cur); 2162 error = -EFSCORRUPTED; 2163 goto error0; 2164 } 2165 if ((error = xfs_btree_delete(cnt_cur, &i))) 2166 goto error0; 2167 if (XFS_IS_CORRUPT(mp, i != 1)) { 2168 xfs_btree_mark_sick(cnt_cur); 2169 error = -EFSCORRUPTED; 2170 goto error0; 2171 } 2172 /* 2173 * Delete the old by-size entry on the right. 2174 */ 2175 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2176 goto error0; 2177 if (XFS_IS_CORRUPT(mp, i != 1)) { 2178 xfs_btree_mark_sick(cnt_cur); 2179 error = -EFSCORRUPTED; 2180 goto error0; 2181 } 2182 if ((error = xfs_btree_delete(cnt_cur, &i))) 2183 goto error0; 2184 if (XFS_IS_CORRUPT(mp, i != 1)) { 2185 xfs_btree_mark_sick(cnt_cur); 2186 error = -EFSCORRUPTED; 2187 goto error0; 2188 } 2189 /* 2190 * Delete the old by-block entry for the right block. 2191 */ 2192 if ((error = xfs_btree_delete(bno_cur, &i))) 2193 goto error0; 2194 if (XFS_IS_CORRUPT(mp, i != 1)) { 2195 xfs_btree_mark_sick(bno_cur); 2196 error = -EFSCORRUPTED; 2197 goto error0; 2198 } 2199 /* 2200 * Move the by-block cursor back to the left neighbor. 2201 */ 2202 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2203 goto error0; 2204 if (XFS_IS_CORRUPT(mp, i != 1)) { 2205 xfs_btree_mark_sick(bno_cur); 2206 error = -EFSCORRUPTED; 2207 goto error0; 2208 } 2209 #ifdef DEBUG 2210 /* 2211 * Check that this is the right record: delete didn't 2212 * mangle the cursor.
2213 */ 2214 { 2215 xfs_agblock_t xxbno; 2216 xfs_extlen_t xxlen; 2217 2218 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2219 &i))) 2220 goto error0; 2221 if (XFS_IS_CORRUPT(mp, 2222 i != 1 || 2223 xxbno != ltbno || 2224 xxlen != ltlen)) { 2225 xfs_btree_mark_sick(bno_cur); 2226 error = -EFSCORRUPTED; 2227 goto error0; 2228 } 2229 } 2230 #endif 2231 /* 2232 * Update remaining by-block entry to the new, joined block. 2233 */ 2234 nbno = ltbno; 2235 nlen = len + ltlen + gtlen; 2236 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2237 goto error0; 2238 } 2239 /* 2240 * Have only a left contiguous neighbor. 2241 * Merge it together with the new freespace. 2242 */ 2243 else if (haveleft) { 2244 /* 2245 * Delete the old by-size entry on the left. 2246 */ 2247 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2248 goto error0; 2249 if (XFS_IS_CORRUPT(mp, i != 1)) { 2250 xfs_btree_mark_sick(cnt_cur); 2251 error = -EFSCORRUPTED; 2252 goto error0; 2253 } 2254 if ((error = xfs_btree_delete(cnt_cur, &i))) 2255 goto error0; 2256 if (XFS_IS_CORRUPT(mp, i != 1)) { 2257 xfs_btree_mark_sick(cnt_cur); 2258 error = -EFSCORRUPTED; 2259 goto error0; 2260 } 2261 /* 2262 * Back up the by-block cursor to the left neighbor, and 2263 * update its length. 2264 */ 2265 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2266 goto error0; 2267 if (XFS_IS_CORRUPT(mp, i != 1)) { 2268 xfs_btree_mark_sick(bno_cur); 2269 error = -EFSCORRUPTED; 2270 goto error0; 2271 } 2272 nbno = ltbno; 2273 nlen = len + ltlen; 2274 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2275 goto error0; 2276 } 2277 /* 2278 * Have only a right contiguous neighbor. 2279 * Merge it together with the new freespace. 2280 */ 2281 else if (haveright) { 2282 /* 2283 * Delete the old by-size entry on the right. 2284 */ 2285 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2286 goto error0; 2287 if (XFS_IS_CORRUPT(mp, i != 1)) { 2288 xfs_btree_mark_sick(cnt_cur); 2289 error = -EFSCORRUPTED; 2290 goto error0; 2291 } 2292 if ((error = xfs_btree_delete(cnt_cur, &i))) 2293 goto error0; 2294 if (XFS_IS_CORRUPT(mp, i != 1)) { 2295 xfs_btree_mark_sick(cnt_cur); 2296 error = -EFSCORRUPTED; 2297 goto error0; 2298 } 2299 /* 2300 * Update the starting block and length of the right 2301 * neighbor in the by-block tree. 2302 */ 2303 nbno = bno; 2304 nlen = len + gtlen; 2305 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2306 goto error0; 2307 } 2308 /* 2309 * No contiguous neighbors. 2310 * Insert the new freespace into the by-block tree. 2311 */ 2312 else { 2313 nbno = bno; 2314 nlen = len; 2315 if ((error = xfs_btree_insert(bno_cur, &i))) 2316 goto error0; 2317 if (XFS_IS_CORRUPT(mp, i != 1)) { 2318 xfs_btree_mark_sick(bno_cur); 2319 error = -EFSCORRUPTED; 2320 goto error0; 2321 } 2322 } 2323 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2324 bno_cur = NULL; 2325 2326 /* 2327 * In all cases we need to insert the new freespace in the by-size tree. 2328 * 2329 * If this new freespace is being inserted in the block that contains 2330 * the largest free space in the btree, make sure we also fix up the 2331 * agf->agf-longest tracker field. 
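 * (xfs_alloc_cursor_at_lastrec() below detects that the insertion point
 * sits at the right edge of the by-size btree, i.e. the new record will
 * be the longest free extent in the AG, so agf_longest has to be fixed
 * up once the insert is done.)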
2332 */ 2333 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2334 goto error0; 2335 if (XFS_IS_CORRUPT(mp, i != 0)) { 2336 xfs_btree_mark_sick(cnt_cur); 2337 error = -EFSCORRUPTED; 2338 goto error0; 2339 } 2340 if (xfs_alloc_cursor_at_lastrec(cnt_cur)) 2341 fixup_longest = true; 2342 if ((error = xfs_btree_insert(cnt_cur, &i))) 2343 goto error0; 2344 if (XFS_IS_CORRUPT(mp, i != 1)) { 2345 xfs_btree_mark_sick(cnt_cur); 2346 error = -EFSCORRUPTED; 2347 goto error0; 2348 } 2349 if (fixup_longest) { 2350 error = xfs_alloc_fixup_longest(cnt_cur); 2351 if (error) 2352 goto error0; 2353 } 2354 2355 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2356 cnt_cur = NULL; 2357 2358 /* 2359 * Update the freespace totals in the ag and superblock. 2360 */ 2361 error = xfs_alloc_update_counters(tp, agbp, len); 2362 xfs_ag_resv_free_extent(pag, type, tp, len); 2363 if (error) 2364 goto error0; 2365 2366 XFS_STATS_INC(mp, xs_freex); 2367 XFS_STATS_ADD(mp, xs_freeb, len); 2368 2369 trace_xfs_free_extent(pag, bno, len, type, haveleft, haveright); 2370 2371 return 0; 2372 2373 error0: 2374 trace_xfs_free_extent(pag, bno, len, type, -1, -1); 2375 if (bno_cur) 2376 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2377 if (cnt_cur) 2378 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2379 return error; 2380 } 2381 2382 /* 2383 * Visible (exported) allocation/free functions. 2384 * Some of these are used just by xfs_alloc_btree.c and this file. 2385 */ 2386 2387 /* 2388 * Compute and fill in value of m_alloc_maxlevels. 2389 */ 2390 void 2391 xfs_alloc_compute_maxlevels( 2392 xfs_mount_t *mp) /* file system mount structure */ 2393 { 2394 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2395 (mp->m_sb.sb_agblocks + 1) / 2); 2396 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2397 } 2398 2399 /* 2400 * Find the length of the longest extent in an AG. The 'need' parameter 2401 * specifies how much space we're going to need for the AGFL and the 2402 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2403 * other callers. 2404 */ 2405 xfs_extlen_t 2406 xfs_alloc_longest_free_extent( 2407 struct xfs_perag *pag, 2408 xfs_extlen_t need, 2409 xfs_extlen_t reserved) 2410 { 2411 xfs_extlen_t delta = 0; 2412 2413 /* 2414 * If the AGFL needs a recharge, we'll have to subtract that from the 2415 * longest extent. 2416 */ 2417 if (need > pag->pagf_flcount) 2418 delta = need - pag->pagf_flcount; 2419 2420 /* 2421 * If we cannot maintain others' reservations with space from the 2422 * not-longest freesp extents, we'll have to subtract /that/ from 2423 * the longest extent too. 2424 */ 2425 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2426 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2427 2428 /* 2429 * If the longest extent is long enough to satisfy all the 2430 * reservations and AGFL rules in place, we can return this extent. 2431 */ 2432 if (pag->pagf_longest > delta) 2433 return min_t(xfs_extlen_t, pag_mount(pag)->m_ag_max_usable, 2434 pag->pagf_longest - delta); 2435 2436 /* Otherwise, let the caller try for 1 block if there's space. */ 2437 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2438 } 2439 2440 /* 2441 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2442 * return the largest possible minimum length. 2443 */ 2444 unsigned int 2445 xfs_alloc_min_freelist( 2446 struct xfs_mount *mp, 2447 struct xfs_perag *pag) 2448 { 2449 /* AG btrees have at least 1 level. 
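 * (every btree has at least a root block, so when no perag is available
 * we assume single-level trees). As a worked example of the reservation
 * formula below: with bno_level = 3 and m_alloc_maxlevels = 5, the by-bno
 * btree contributes min(3 + 1, 5) * 2 - 2 = 6 blocks to min_free.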
*/ 2450 const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; 2451 const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; 2452 const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1; 2453 unsigned int min_free; 2454 2455 ASSERT(mp->m_alloc_maxlevels > 0); 2456 2457 /* 2458 * For a btree shorter than the maximum height, the worst case is that 2459 * every level gets split and a new level is added, then while inserting 2460 * another entry to refill the AGFL, every level under the old root gets 2461 * split again. This is: 2462 * 2463 * (full height split reservation) + (AGFL refill split height) 2464 * = (current height + 1) + (current height - 1) 2465 * = (new height) + (new height - 2) 2466 * = 2 * new height - 2 2467 * 2468 * For a btree of maximum height, the worst case is that every level 2469 * under the root gets split, then while inserting another entry to 2470 * refill the AGFL, every level under the root gets split again. This is 2471 * also: 2472 * 2473 * 2 * (current height - 1) 2474 * = 2 * (new height - 1) 2475 * = 2 * new height - 2 2476 */ 2477 2478 /* space needed by-bno freespace btree */ 2479 min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; 2480 /* space needed by-size freespace btree */ 2481 min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; 2482 /* space needed reverse mapping used space btree */ 2483 if (xfs_has_rmapbt(mp)) 2484 min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; 2485 return min_free; 2486 } 2487 2488 /* 2489 * Check if the operation we are fixing up the freelist for should go ahead or 2490 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2491 * is dependent on whether the size and shape of free space available will 2492 * permit the requested allocation to take place. 2493 */ 2494 static bool 2495 xfs_alloc_space_available( 2496 struct xfs_alloc_arg *args, 2497 xfs_extlen_t min_free, 2498 int flags) 2499 { 2500 struct xfs_perag *pag = args->pag; 2501 xfs_extlen_t alloc_len, longest; 2502 xfs_extlen_t reservation; /* blocks that are still reserved */ 2503 int available; 2504 xfs_extlen_t agflcount; 2505 2506 if (flags & XFS_ALLOC_FLAG_FREEING) 2507 return true; 2508 2509 reservation = xfs_ag_resv_needed(pag, args->resv); 2510 2511 /* do we have enough contiguous free space for the allocation? */ 2512 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2513 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2514 if (longest < alloc_len) 2515 return false; 2516 2517 /* 2518 * Do we have enough free space remaining for the allocation? Don't 2519 * account extra agfl blocks because we are about to defer free them, 2520 * making them unavailable until the current transaction commits. 2521 */ 2522 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2523 available = (int)(pag->pagf_freeblks + agflcount - 2524 reservation - min_free - args->minleft); 2525 if (available < (int)max(args->total, alloc_len)) 2526 return false; 2527 2528 /* 2529 * Clamp maxlen to the amount of free space available for the actual 2530 * extent allocation. 2531 */ 2532 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2533 args->maxlen = available; 2534 ASSERT(args->maxlen > 0); 2535 ASSERT(args->maxlen >= args->minlen); 2536 } 2537 2538 return true; 2539 } 2540 2541 /* 2542 * Check the agfl fields of the agf for inconsistency or corruption. 
2543 * 2544 * The original purpose was to detect an agfl header padding mismatch between 2545 * current and early v5 kernels. This problem manifests as a 1-slot size 2546 * difference between the on-disk flcount and the active [first, last] range of 2547 * a wrapped agfl. 2548 * 2549 * However, we need to use these same checks to catch agfl count corruptions 2550 * unrelated to padding. This could occur on any v4 or v5 filesystem, so either 2551 * way, we need to reset the agfl and warn the user. 2552 * 2553 * Return true if a reset is required before the agfl can be used, false 2554 * otherwise. 2555 */ 2556 static bool 2557 xfs_agfl_needs_reset( 2558 struct xfs_mount *mp, 2559 struct xfs_agf *agf) 2560 { 2561 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2562 uint32_t l = be32_to_cpu(agf->agf_fllast); 2563 uint32_t c = be32_to_cpu(agf->agf_flcount); 2564 int agfl_size = xfs_agfl_size(mp); 2565 int active; 2566 2567 /* 2568 * The agf read verifier catches severe corruption of these fields. 2569 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2570 * the verifier allows it. 2571 */ 2572 if (f >= agfl_size || l >= agfl_size) 2573 return true; 2574 if (c > agfl_size) 2575 return true; 2576 2577 /* 2578 * Check consistency between the on-disk count and the active range. An 2579 * agfl padding mismatch manifests as an inconsistent flcount. 2580 */ 2581 if (c && l >= f) 2582 active = l - f + 1; 2583 else if (c) 2584 active = agfl_size - f + l + 1; 2585 else 2586 active = 0; 2587 2588 return active != c; 2589 } 2590 2591 /* 2592 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2593 * agfl content cannot be trusted. Warn the user that a repair is required to 2594 * recover leaked blocks. 2595 * 2596 * The purpose of this mechanism is to handle filesystems affected by the agfl 2597 * header padding mismatch problem. A reset keeps the filesystem online with a 2598 * relatively minor free space accounting inconsistency rather than suffer the 2599 * inevitable crash from use of an invalid agfl block. 2600 */ 2601 static void 2602 xfs_agfl_reset( 2603 struct xfs_trans *tp, 2604 struct xfs_buf *agbp, 2605 struct xfs_perag *pag) 2606 { 2607 struct xfs_mount *mp = tp->t_mountp; 2608 struct xfs_agf *agf = agbp->b_addr; 2609 2610 ASSERT(xfs_perag_agfl_needs_reset(pag)); 2611 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2612 2613 xfs_warn(mp, 2614 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2615 "Please unmount and run xfs_repair.", 2616 pag_agno(pag), pag->pagf_flcount); 2617 2618 agf->agf_flfirst = 0; 2619 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2620 agf->agf_flcount = 0; 2621 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2622 XFS_AGF_FLCOUNT); 2623 2624 pag->pagf_flcount = 0; 2625 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 2626 } 2627 2628 /* 2629 * Add the extent to the list of extents to be free at transaction end. 2630 * The list is maintained sorted (by block number). 
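 *
 * Callers normally reach this through xfs_free_extent_later(). A minimal,
 * purely illustrative call, assuming fsbno/len describe the extent being
 * released and oinfo names its rmap owner, looks like:
 *
 *	error = xfs_free_extent_later(tp, fsbno, len, &oinfo,
 *			XFS_AG_RESV_NONE, 0);
 *	if (error)
 *		return error;
 *
 * The space is not returned to the free space btrees here; that happens
 * later, when the deferred work item is finished as part of committing
 * the transaction chain.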
2631 */ 2632 static int 2633 xfs_defer_extent_free( 2634 struct xfs_trans *tp, 2635 xfs_fsblock_t bno, 2636 xfs_filblks_t len, 2637 const struct xfs_owner_info *oinfo, 2638 enum xfs_ag_resv_type type, 2639 unsigned int free_flags, 2640 struct xfs_defer_pending **dfpp) 2641 { 2642 struct xfs_extent_free_item *xefi; 2643 struct xfs_mount *mp = tp->t_mountp; 2644 2645 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2646 ASSERT(!isnullstartblock(bno)); 2647 ASSERT(!(free_flags & ~XFS_FREE_EXTENT_ALL_FLAGS)); 2648 2649 if (free_flags & XFS_FREE_EXTENT_REALTIME) { 2650 if (type != XFS_AG_RESV_NONE) { 2651 ASSERT(type == XFS_AG_RESV_NONE); 2652 return -EFSCORRUPTED; 2653 } 2654 if (XFS_IS_CORRUPT(mp, !xfs_verify_rtbext(mp, bno, len))) 2655 return -EFSCORRUPTED; 2656 } else { 2657 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) 2658 return -EFSCORRUPTED; 2659 } 2660 2661 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2662 GFP_KERNEL | __GFP_NOFAIL); 2663 xefi->xefi_startblock = bno; 2664 xefi->xefi_blockcount = (xfs_extlen_t)len; 2665 xefi->xefi_agresv = type; 2666 if (free_flags & XFS_FREE_EXTENT_SKIP_DISCARD) 2667 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2668 if (free_flags & XFS_FREE_EXTENT_REALTIME) 2669 xefi->xefi_flags |= XFS_EFI_REALTIME; 2670 if (oinfo) { 2671 ASSERT(oinfo->oi_offset == 0); 2672 2673 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2674 xefi->xefi_flags |= XFS_EFI_ATTR_FORK; 2675 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2676 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2677 xefi->xefi_owner = oinfo->oi_owner; 2678 } else { 2679 xefi->xefi_owner = XFS_RMAP_OWN_NULL; 2680 } 2681 2682 xfs_extent_free_defer_add(tp, xefi, dfpp); 2683 return 0; 2684 } 2685 2686 int 2687 xfs_free_extent_later( 2688 struct xfs_trans *tp, 2689 xfs_fsblock_t bno, 2690 xfs_filblks_t len, 2691 const struct xfs_owner_info *oinfo, 2692 enum xfs_ag_resv_type type, 2693 unsigned int free_flags) 2694 { 2695 struct xfs_defer_pending *dontcare = NULL; 2696 2697 return xfs_defer_extent_free(tp, bno, len, oinfo, type, free_flags, 2698 &dontcare); 2699 } 2700 2701 /* 2702 * Set up automatic freeing of unwritten space in the filesystem. 2703 * 2704 * This function attached a paused deferred extent free item to the 2705 * transaction. Pausing means that the EFI will be logged in the next 2706 * transaction commit, but the pending EFI will not be finished until the 2707 * pending item is unpaused. 2708 * 2709 * If the system goes down after the EFI has been persisted to the log but 2710 * before the pending item is unpaused, log recovery will find the EFI, fail to 2711 * find the EFD, and free the space. 2712 * 2713 * If the pending item is unpaused, the next transaction commit will log an EFD 2714 * without freeing the space. 2715 * 2716 * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the 2717 * @args structure are set to the relevant values. 2718 */ 2719 int 2720 xfs_alloc_schedule_autoreap( 2721 const struct xfs_alloc_arg *args, 2722 unsigned int free_flags, 2723 struct xfs_alloc_autoreap *aarp) 2724 { 2725 int error; 2726 2727 error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, 2728 &args->oinfo, args->resv, free_flags, &aarp->dfp); 2729 if (error) 2730 return error; 2731 2732 xfs_defer_item_pause(args->tp, aarp->dfp); 2733 return 0; 2734 } 2735 2736 /* 2737 * Cancel automatic freeing of unwritten space in the filesystem. 
2738 * 2739 * Earlier, we created a paused deferred extent free item and attached it to 2740 * this transaction so that we could automatically roll back a new space 2741 * allocation if the system went down. Now we want to cancel the paused work 2742 * item by marking the EFI stale so we don't actually free the space, unpausing 2743 * the pending item and logging an EFD. 2744 * 2745 * The caller generally should have already mapped the space into the ondisk 2746 * filesystem. If the reserved space was partially used, the caller must call 2747 * xfs_free_extent_later to create a new EFI to free the unused space. 2748 */ 2749 void 2750 xfs_alloc_cancel_autoreap( 2751 struct xfs_trans *tp, 2752 struct xfs_alloc_autoreap *aarp) 2753 { 2754 struct xfs_defer_pending *dfp = aarp->dfp; 2755 struct xfs_extent_free_item *xefi; 2756 2757 if (!dfp) 2758 return; 2759 2760 list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) 2761 xefi->xefi_flags |= XFS_EFI_CANCELLED; 2762 2763 xfs_defer_item_unpause(tp, dfp); 2764 } 2765 2766 /* 2767 * Commit automatic freeing of unwritten space in the filesystem. 2768 * 2769 * This unpauses an earlier _schedule_autoreap and commits to freeing the 2770 * allocated space. Call this if none of the reserved space was used. 2771 */ 2772 void 2773 xfs_alloc_commit_autoreap( 2774 struct xfs_trans *tp, 2775 struct xfs_alloc_autoreap *aarp) 2776 { 2777 if (aarp->dfp) 2778 xfs_defer_item_unpause(tp, aarp->dfp); 2779 } 2780 2781 /* 2782 * Check if an AGF has a free extent record whose length is equal to 2783 * args->minlen. 2784 */ 2785 STATIC int 2786 xfs_exact_minlen_extent_available( 2787 struct xfs_alloc_arg *args, 2788 struct xfs_buf *agbp, 2789 int *stat) 2790 { 2791 struct xfs_btree_cur *cnt_cur; 2792 xfs_agblock_t fbno; 2793 xfs_extlen_t flen; 2794 int error = 0; 2795 2796 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, 2797 args->pag); 2798 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2799 if (error) 2800 goto out; 2801 2802 if (*stat == 0) { 2803 xfs_btree_mark_sick(cnt_cur); 2804 error = -EFSCORRUPTED; 2805 goto out; 2806 } 2807 2808 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2809 if (error) 2810 goto out; 2811 2812 if (*stat == 1 && flen != args->minlen) 2813 *stat = 0; 2814 2815 out: 2816 xfs_btree_del_cursor(cnt_cur, error); 2817 2818 return error; 2819 } 2820 2821 /* 2822 * Decide whether to use this allocation group for this allocation. 2823 * If so, fix up the btree freelist's size. 2824 */ 2825 int /* error */ 2826 xfs_alloc_fix_freelist( 2827 struct xfs_alloc_arg *args, /* allocation argument structure */ 2828 uint32_t alloc_flags) 2829 { 2830 struct xfs_mount *mp = args->mp; 2831 struct xfs_perag *pag = args->pag; 2832 struct xfs_trans *tp = args->tp; 2833 struct xfs_buf *agbp = NULL; 2834 struct xfs_buf *agflbp = NULL; 2835 struct xfs_alloc_arg targs; /* local allocation arguments */ 2836 xfs_agblock_t bno; /* freelist block */ 2837 xfs_extlen_t need; /* total blocks needed in freelist */ 2838 int error = 0; 2839 2840 /* deferred ops (AGFL block frees) require permanent transactions */ 2841 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2842 2843 if (!xfs_perag_initialised_agf(pag)) { 2844 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2845 if (error) { 2846 /* Couldn't lock the AGF so skip this AG. 
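 * (-EAGAIN means a trylock attempt on the AGF buffer failed because
 * another thread holds it; we squash it to success with no AGF buffer
 * attached so the caller simply moves on to another AG.)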
*/ 2847 if (error == -EAGAIN) 2848 error = 0; 2849 goto out_no_agbp; 2850 } 2851 } 2852 2853 /* 2854 * If this is a metadata preferred pag and we are user data then try 2855 * somewhere else if we are not being asked to try harder at this 2856 * point 2857 */ 2858 if (xfs_perag_prefers_metadata(pag) && 2859 (args->datatype & XFS_ALLOC_USERDATA) && 2860 (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2861 ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); 2862 goto out_agbp_relse; 2863 } 2864 2865 need = xfs_alloc_min_freelist(mp, pag); 2866 if (!xfs_alloc_space_available(args, need, alloc_flags | 2867 XFS_ALLOC_FLAG_CHECK)) 2868 goto out_agbp_relse; 2869 2870 /* 2871 * Get the a.g. freespace buffer. 2872 * Can fail if we're not blocking on locks, and it's held. 2873 */ 2874 if (!agbp) { 2875 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2876 if (error) { 2877 /* Couldn't lock the AGF so skip this AG. */ 2878 if (error == -EAGAIN) 2879 error = 0; 2880 goto out_no_agbp; 2881 } 2882 } 2883 2884 /* reset a padding mismatched agfl before final free space check */ 2885 if (xfs_perag_agfl_needs_reset(pag)) 2886 xfs_agfl_reset(tp, agbp, pag); 2887 2888 /* If there isn't enough total space or single-extent, reject it. */ 2889 need = xfs_alloc_min_freelist(mp, pag); 2890 if (!xfs_alloc_space_available(args, need, alloc_flags)) 2891 goto out_agbp_relse; 2892 2893 if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { 2894 int stat; 2895 2896 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2897 if (error || !stat) 2898 goto out_agbp_relse; 2899 } 2900 2901 /* 2902 * Make the freelist shorter if it's too long. 2903 * 2904 * Note that from this point onwards, we will always release the agf and 2905 * agfl buffers on error. This handles the case where we error out and 2906 * the buffers are clean or may not have been joined to the transaction 2907 * and hence need to be released manually. If they have been joined to 2908 * the transaction, then xfs_trans_brelse() will handle them 2909 * appropriately based on the recursion count and dirty state of the 2910 * buffer. 2911 * 2912 * XXX (dgc): When we have lots of free space, does this buy us 2913 * anything other than extra overhead when we need to put more blocks 2914 * back on the free list? Maybe we should only do this when space is 2915 * getting low or the AGFL is more than half full? 2916 * 2917 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2918 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2919 * updating the rmapbt. Both flags are used in xfs_repair while we're 2920 * rebuilding the rmapbt, and neither are used by the kernel. They're 2921 * both required to ensure that rmaps are correctly recorded for the 2922 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2923 * repair/rmap.c in xfsprogs for details. 2924 */ 2925 memset(&targs, 0, sizeof(targs)); 2926 /* struct copy below */ 2927 if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) 2928 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2929 else 2930 targs.oinfo = XFS_RMAP_OINFO_AG; 2931 while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && 2932 pag->pagf_flcount > need) { 2933 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2934 if (error) 2935 goto out_agbp_relse; 2936 2937 /* 2938 * Defer the AGFL block free. 2939 * 2940 * This helps to prevent log reservation overruns due to too 2941 * many allocation operations in a transaction. 
AGFL frees are 2942 * prone to this problem because for one they are always freed 2943 * one at a time. Further, an immediate AGFL block free can 2944 * cause a btree join and require another block free before the 2945 * real allocation can proceed. 2946 * Deferring the free disconnects freeing up the AGFL slot from 2947 * freeing the block. 2948 */ 2949 error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, bno), 2950 1, &targs.oinfo, XFS_AG_RESV_AGFL, 0); 2951 if (error) 2952 goto out_agbp_relse; 2953 } 2954 2955 targs.tp = tp; 2956 targs.mp = mp; 2957 targs.agbp = agbp; 2958 targs.agno = args->agno; 2959 targs.alignment = targs.minlen = targs.prod = 1; 2960 targs.pag = pag; 2961 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2962 if (error) 2963 goto out_agbp_relse; 2964 2965 /* Make the freelist longer if it's too short. */ 2966 while (pag->pagf_flcount < need) { 2967 targs.agbno = 0; 2968 targs.maxlen = need - pag->pagf_flcount; 2969 targs.resv = XFS_AG_RESV_AGFL; 2970 2971 /* Allocate as many blocks as possible at once. */ 2972 error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); 2973 if (error) 2974 goto out_agflbp_relse; 2975 2976 /* 2977 * Stop if we run out. Won't happen if callers are obeying 2978 * the restrictions correctly. Can happen for free calls 2979 * on a completely full ag. 2980 */ 2981 if (targs.agbno == NULLAGBLOCK) { 2982 if (alloc_flags & XFS_ALLOC_FLAG_FREEING) 2983 break; 2984 goto out_agflbp_relse; 2985 } 2986 2987 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { 2988 error = xfs_rmap_alloc(tp, agbp, pag, 2989 targs.agbno, targs.len, &targs.oinfo); 2990 if (error) 2991 goto out_agflbp_relse; 2992 } 2993 error = xfs_alloc_update_counters(tp, agbp, 2994 -((long)(targs.len))); 2995 if (error) 2996 goto out_agflbp_relse; 2997 2998 /* 2999 * Put each allocated block on the list. 3000 */ 3001 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 3002 error = xfs_alloc_put_freelist(pag, tp, agbp, 3003 agflbp, bno, 0); 3004 if (error) 3005 goto out_agflbp_relse; 3006 } 3007 } 3008 xfs_trans_brelse(tp, agflbp); 3009 args->agbp = agbp; 3010 return 0; 3011 3012 out_agflbp_relse: 3013 xfs_trans_brelse(tp, agflbp); 3014 out_agbp_relse: 3015 if (agbp) 3016 xfs_trans_brelse(tp, agbp); 3017 out_no_agbp: 3018 args->agbp = NULL; 3019 return error; 3020 } 3021 3022 /* 3023 * Get a block from the freelist. 3024 * Returns with the buffer for the block gotten. 3025 */ 3026 int 3027 xfs_alloc_get_freelist( 3028 struct xfs_perag *pag, 3029 struct xfs_trans *tp, 3030 struct xfs_buf *agbp, 3031 xfs_agblock_t *bnop, 3032 int btreeblk) 3033 { 3034 struct xfs_agf *agf = agbp->b_addr; 3035 struct xfs_buf *agflbp; 3036 xfs_agblock_t bno; 3037 __be32 *agfl_bno; 3038 int error; 3039 uint32_t logflags; 3040 struct xfs_mount *mp = tp->t_mountp; 3041 3042 /* 3043 * Freelist is empty, give up. 3044 */ 3045 if (!agf->agf_flcount) { 3046 *bnop = NULLAGBLOCK; 3047 return 0; 3048 } 3049 /* 3050 * Read the array of free blocks. 3051 */ 3052 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 3053 if (error) 3054 return error; 3055 3056 3057 /* 3058 * Get the block number and update the data structures. 
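 * The AGFL is a circular array of block numbers indexed by agf_flfirst
 * and agf_fllast: we hand out the entry at agf_flfirst and then advance
 * that index, wrapping back to slot zero once it walks off the end of
 * the array.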
3059 */ 3060 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3061 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 3062 if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) 3063 return -EFSCORRUPTED; 3064 3065 be32_add_cpu(&agf->agf_flfirst, 1); 3066 xfs_trans_brelse(tp, agflbp); 3067 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 3068 agf->agf_flfirst = 0; 3069 3070 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 3071 be32_add_cpu(&agf->agf_flcount, -1); 3072 pag->pagf_flcount--; 3073 3074 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 3075 if (btreeblk) { 3076 be32_add_cpu(&agf->agf_btreeblks, 1); 3077 pag->pagf_btreeblks++; 3078 logflags |= XFS_AGF_BTREEBLKS; 3079 } 3080 3081 xfs_alloc_log_agf(tp, agbp, logflags); 3082 *bnop = bno; 3083 3084 return 0; 3085 } 3086 3087 /* 3088 * Log the given fields from the agf structure. 3089 */ 3090 void 3091 xfs_alloc_log_agf( 3092 struct xfs_trans *tp, 3093 struct xfs_buf *bp, 3094 uint32_t fields) 3095 { 3096 int first; /* first byte offset */ 3097 int last; /* last byte offset */ 3098 static const short offsets[] = { 3099 offsetof(xfs_agf_t, agf_magicnum), 3100 offsetof(xfs_agf_t, agf_versionnum), 3101 offsetof(xfs_agf_t, agf_seqno), 3102 offsetof(xfs_agf_t, agf_length), 3103 offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ 3104 offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ 3105 offsetof(xfs_agf_t, agf_flfirst), 3106 offsetof(xfs_agf_t, agf_fllast), 3107 offsetof(xfs_agf_t, agf_flcount), 3108 offsetof(xfs_agf_t, agf_freeblks), 3109 offsetof(xfs_agf_t, agf_longest), 3110 offsetof(xfs_agf_t, agf_btreeblks), 3111 offsetof(xfs_agf_t, agf_uuid), 3112 offsetof(xfs_agf_t, agf_rmap_blocks), 3113 offsetof(xfs_agf_t, agf_refcount_blocks), 3114 offsetof(xfs_agf_t, agf_refcount_root), 3115 offsetof(xfs_agf_t, agf_refcount_level), 3116 /* needed so that we don't log the whole rest of the structure: */ 3117 offsetof(xfs_agf_t, agf_spare64), 3118 sizeof(xfs_agf_t) 3119 }; 3120 3121 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 3122 3123 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 3124 3125 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 3126 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 3127 } 3128 3129 /* 3130 * Put the block on the freelist for the allocation group. 
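 * This is the mirror of xfs_alloc_get_freelist(): agf_fllast is advanced
 * (wrapping at the end of the AGFL) and the new block number is written
 * into that slot, growing the active [agf_flfirst, agf_fllast] range by
 * one.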
3131 */ 3132 int 3133 xfs_alloc_put_freelist( 3134 struct xfs_perag *pag, 3135 struct xfs_trans *tp, 3136 struct xfs_buf *agbp, 3137 struct xfs_buf *agflbp, 3138 xfs_agblock_t bno, 3139 int btreeblk) 3140 { 3141 struct xfs_mount *mp = tp->t_mountp; 3142 struct xfs_agf *agf = agbp->b_addr; 3143 __be32 *blockp; 3144 int error; 3145 uint32_t logflags; 3146 __be32 *agfl_bno; 3147 int startoff; 3148 3149 if (!agflbp) { 3150 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 3151 if (error) 3152 return error; 3153 } 3154 3155 be32_add_cpu(&agf->agf_fllast, 1); 3156 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 3157 agf->agf_fllast = 0; 3158 3159 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 3160 be32_add_cpu(&agf->agf_flcount, 1); 3161 pag->pagf_flcount++; 3162 3163 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 3164 if (btreeblk) { 3165 be32_add_cpu(&agf->agf_btreeblks, -1); 3166 pag->pagf_btreeblks--; 3167 logflags |= XFS_AGF_BTREEBLKS; 3168 } 3169 3170 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 3171 3172 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3173 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 3174 *blockp = cpu_to_be32(bno); 3175 startoff = (char *)blockp - (char *)agflbp->b_addr; 3176 3177 xfs_alloc_log_agf(tp, agbp, logflags); 3178 3179 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 3180 xfs_trans_log_buf(tp, agflbp, startoff, 3181 startoff + sizeof(xfs_agblock_t) - 1); 3182 return 0; 3183 } 3184 3185 /* 3186 * Check that this AGF/AGI header's sequence number and length matches the AG 3187 * number and size in fsblocks. 3188 */ 3189 xfs_failaddr_t 3190 xfs_validate_ag_length( 3191 struct xfs_buf *bp, 3192 uint32_t seqno, 3193 uint32_t length) 3194 { 3195 struct xfs_mount *mp = bp->b_mount; 3196 /* 3197 * During growfs operations, the perag is not fully initialised, 3198 * so we can't use it for any useful checking. growfs ensures we can't 3199 * use it by using uncached buffers that don't have the perag attached 3200 * so we can detect and avoid this problem. 3201 */ 3202 if (bp->b_pag && seqno != pag_agno(bp->b_pag)) 3203 return __this_address; 3204 3205 /* 3206 * Only the last AG in the filesystem is allowed to be shorter 3207 * than the AG size recorded in the superblock. 3208 */ 3209 if (length != mp->m_sb.sb_agblocks) { 3210 /* 3211 * During growfs, the new last AG can get here before we 3212 * have updated the superblock. Give it a pass on the seqno 3213 * check. 3214 */ 3215 if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) 3216 return __this_address; 3217 if (length < XFS_MIN_AG_BLOCKS) 3218 return __this_address; 3219 if (length > mp->m_sb.sb_agblocks) 3220 return __this_address; 3221 } 3222 3223 return NULL; 3224 } 3225 3226 /* 3227 * Verify the AGF is consistent. 3228 * 3229 * We do not verify the AGFL indexes in the AGF are fully consistent here 3230 * because of issues with variable on-disk structure sizes. Instead, we check 3231 * the agfl indexes for consistency when we initialise the perag from the AGF 3232 * information after a read completes. 3233 * 3234 * If the index is inconsistent, then we mark the perag as needing an AGFL 3235 * reset. The first AGFL update performed then resets the AGFL indexes and 3236 * refills the AGFL with known good free blocks, allowing the filesystem to 3237 * continue operating normally at the cost of a few leaked free space blocks. 
3238 */ 3239 static xfs_failaddr_t 3240 xfs_agf_verify( 3241 struct xfs_buf *bp) 3242 { 3243 struct xfs_mount *mp = bp->b_mount; 3244 struct xfs_agf *agf = bp->b_addr; 3245 xfs_failaddr_t fa; 3246 uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); 3247 uint32_t agf_length = be32_to_cpu(agf->agf_length); 3248 3249 if (xfs_has_crc(mp)) { 3250 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 3251 return __this_address; 3252 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 3253 return __this_address; 3254 } 3255 3256 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 3257 return __this_address; 3258 3259 if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) 3260 return __this_address; 3261 3262 /* 3263 * Both agf_seqno and agf_length need to validated before anything else 3264 * block number related in the AGF or AGFL can be checked. 3265 */ 3266 fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); 3267 if (fa) 3268 return fa; 3269 3270 if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) 3271 return __this_address; 3272 if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) 3273 return __this_address; 3274 if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) 3275 return __this_address; 3276 3277 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 3278 be32_to_cpu(agf->agf_freeblks) > agf_length) 3279 return __this_address; 3280 3281 if (be32_to_cpu(agf->agf_bno_level) < 1 || 3282 be32_to_cpu(agf->agf_cnt_level) < 1 || 3283 be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || 3284 be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) 3285 return __this_address; 3286 3287 if (xfs_has_lazysbcount(mp) && 3288 be32_to_cpu(agf->agf_btreeblks) > agf_length) 3289 return __this_address; 3290 3291 if (xfs_has_rmapbt(mp)) { 3292 if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) 3293 return __this_address; 3294 3295 if (be32_to_cpu(agf->agf_rmap_level) < 1 || 3296 be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) 3297 return __this_address; 3298 } 3299 3300 if (xfs_has_reflink(mp)) { 3301 if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) 3302 return __this_address; 3303 3304 if (be32_to_cpu(agf->agf_refcount_level) < 1 || 3305 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) 3306 return __this_address; 3307 } 3308 3309 return NULL; 3310 } 3311 3312 static void 3313 xfs_agf_read_verify( 3314 struct xfs_buf *bp) 3315 { 3316 struct xfs_mount *mp = bp->b_mount; 3317 xfs_failaddr_t fa; 3318 3319 if (xfs_has_crc(mp) && 3320 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 3321 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 3322 else { 3323 fa = xfs_agf_verify(bp); 3324 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 3325 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3326 } 3327 } 3328 3329 static void 3330 xfs_agf_write_verify( 3331 struct xfs_buf *bp) 3332 { 3333 struct xfs_mount *mp = bp->b_mount; 3334 struct xfs_buf_log_item *bip = bp->b_log_item; 3335 struct xfs_agf *agf = bp->b_addr; 3336 xfs_failaddr_t fa; 3337 3338 fa = xfs_agf_verify(bp); 3339 if (fa) { 3340 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3341 return; 3342 } 3343 3344 if (!xfs_has_crc(mp)) 3345 return; 3346 3347 if (bip) 3348 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 3349 3350 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 3351 } 3352 3353 const struct xfs_buf_ops xfs_agf_buf_ops = { 3354 .name = "xfs_agf", 3355 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 3356 .verify_read = xfs_agf_read_verify, 3357 .verify_write = 
xfs_agf_write_verify, 3358 .verify_struct = xfs_agf_verify, 3359 }; 3360 3361 /* 3362 * Read in the allocation group header (free/alloc section). 3363 */ 3364 int 3365 xfs_read_agf( 3366 struct xfs_perag *pag, 3367 struct xfs_trans *tp, 3368 int flags, 3369 struct xfs_buf **agfbpp) 3370 { 3371 struct xfs_mount *mp = pag_mount(pag); 3372 int error; 3373 3374 trace_xfs_read_agf(pag); 3375 3376 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3377 XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGF_DADDR(mp)), 3378 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 3379 if (xfs_metadata_is_sick(error)) 3380 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); 3381 if (error) 3382 return error; 3383 3384 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3385 return 0; 3386 } 3387 3388 /* 3389 * Read in the allocation group header (free/alloc section) and initialise the 3390 * perag structure if necessary. If the caller provides @agfbpp, then return the 3391 * locked buffer to the caller, otherwise free it. 3392 */ 3393 int 3394 xfs_alloc_read_agf( 3395 struct xfs_perag *pag, 3396 struct xfs_trans *tp, 3397 int flags, 3398 struct xfs_buf **agfbpp) 3399 { 3400 struct xfs_mount *mp = pag_mount(pag); 3401 struct xfs_buf *agfbp; 3402 struct xfs_agf *agf; 3403 int error; 3404 int allocbt_blks; 3405 3406 trace_xfs_alloc_read_agf(pag); 3407 3408 /* We don't support trylock when freeing. */ 3409 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3410 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3411 error = xfs_read_agf(pag, tp, 3412 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 3413 &agfbp); 3414 if (error) 3415 return error; 3416 3417 agf = agfbp->b_addr; 3418 if (!xfs_perag_initialised_agf(pag)) { 3419 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3420 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3421 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3422 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3423 pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); 3424 pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); 3425 pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); 3426 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3427 if (xfs_agfl_needs_reset(mp, agf)) 3428 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3429 else 3430 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3431 3432 /* 3433 * Update the in-core allocbt counter. Filter out the rmapbt 3434 * subset of the btreeblks counter because the rmapbt is managed 3435 * by perag reservation. Subtract one for the rmapbt root block 3436 * because the rmap counter includes it while the btreeblks 3437 * counter only tracks non-root blocks. 
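 * For example, with agf_btreeblks = 10 and agf_rmap_blocks = 4 (the
 * rmapbt root plus three other blocks), allocbt_blks works out to
 * 10 - (4 - 1) = 7 blocks attributed to the bnobt and cntbt.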
3438 */ 3439 allocbt_blks = pag->pagf_btreeblks; 3440 if (xfs_has_rmapbt(mp)) 3441 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3442 if (allocbt_blks > 0) 3443 atomic64_add(allocbt_blks, &mp->m_allocbt_blks); 3444 3445 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); 3446 } 3447 #ifdef DEBUG 3448 else if (!xfs_is_shutdown(mp)) { 3449 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3450 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3451 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3452 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3453 ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); 3454 ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); 3455 } 3456 #endif 3457 if (agfbpp) 3458 *agfbpp = agfbp; 3459 else 3460 xfs_trans_brelse(tp, agfbp); 3461 return 0; 3462 } 3463 3464 /* 3465 * Pre-process allocation arguments to set initial state that we don't require 3466 * callers to set up correctly, as well as bounds check the allocation args 3467 * that are set up. 3468 */ 3469 static int 3470 xfs_alloc_vextent_check_args( 3471 struct xfs_alloc_arg *args, 3472 xfs_fsblock_t target, 3473 xfs_agnumber_t *minimum_agno) 3474 { 3475 struct xfs_mount *mp = args->mp; 3476 xfs_agblock_t agsize; 3477 3478 args->fsbno = NULLFSBLOCK; 3479 3480 *minimum_agno = 0; 3481 if (args->tp->t_highest_agno != NULLAGNUMBER) 3482 *minimum_agno = args->tp->t_highest_agno; 3483 3484 /* 3485 * Just fix this up, for the case where the last a.g. is shorter 3486 * (or there's only one a.g.) and the caller couldn't easily figure 3487 * that out (xfs_bmap_alloc). 3488 */ 3489 agsize = mp->m_sb.sb_agblocks; 3490 if (args->maxlen > agsize) 3491 args->maxlen = agsize; 3492 if (args->alignment == 0) 3493 args->alignment = 1; 3494 3495 ASSERT(args->minlen > 0); 3496 ASSERT(args->maxlen > 0); 3497 ASSERT(args->alignment > 0); 3498 ASSERT(args->resv != XFS_AG_RESV_AGFL); 3499 3500 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); 3501 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); 3502 ASSERT(args->minlen <= args->maxlen); 3503 ASSERT(args->minlen <= agsize); 3504 ASSERT(args->mod < args->prod); 3505 3506 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || 3507 XFS_FSB_TO_AGBNO(mp, target) >= agsize || 3508 args->minlen > args->maxlen || args->minlen > agsize || 3509 args->mod >= args->prod) { 3510 trace_xfs_alloc_vextent_badargs(args); 3511 return -ENOSPC; 3512 } 3513 3514 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { 3515 trace_xfs_alloc_vextent_skip_deadlock(args); 3516 return -ENOSPC; 3517 } 3518 return 0; 3519 3520 } 3521 3522 /* 3523 * Prepare an AG for allocation. If the AG is not prepared to accept the 3524 * allocation, return failure. 3525 * 3526 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are 3527 * modified to hold their own perag references.
3528 */ 3529 static int 3530 xfs_alloc_vextent_prepare_ag( 3531 struct xfs_alloc_arg *args, 3532 uint32_t alloc_flags) 3533 { 3534 bool need_pag = !args->pag; 3535 int error; 3536 3537 if (need_pag) 3538 args->pag = xfs_perag_get(args->mp, args->agno); 3539 3540 args->agbp = NULL; 3541 error = xfs_alloc_fix_freelist(args, alloc_flags); 3542 if (error) { 3543 trace_xfs_alloc_vextent_nofix(args); 3544 if (need_pag) 3545 xfs_perag_put(args->pag); 3546 args->agbno = NULLAGBLOCK; 3547 return error; 3548 } 3549 if (!args->agbp) { 3550 /* cannot allocate in this AG at all */ 3551 trace_xfs_alloc_vextent_noagbp(args); 3552 args->agbno = NULLAGBLOCK; 3553 return 0; 3554 } 3555 args->wasfromfl = 0; 3556 return 0; 3557 } 3558 3559 /* 3560 * Post-process allocation results to account for the allocation if it succeeded 3561 * and set the allocated block number correctly for the caller. 3562 * 3563 * XXX: we should really be returning ENOSPC for ENOSPC, not 3564 * hiding it behind a "successful" NULLFSBLOCK allocation. 3565 */ 3566 static int 3567 xfs_alloc_vextent_finish( 3568 struct xfs_alloc_arg *args, 3569 xfs_agnumber_t minimum_agno, 3570 int alloc_error, 3571 bool drop_perag) 3572 { 3573 struct xfs_mount *mp = args->mp; 3574 int error = 0; 3575 3576 /* 3577 * We can end up here with a locked AGF. If we failed, the caller is 3578 * likely going to try to allocate again with different parameters, and 3579 * that can widen the AGs that are searched for free space. If we have 3580 * to do BMBT block allocation, we have to do a new allocation. 3581 * 3582 * Hence leaving this function with the AGF locked opens up potential 3583 * ABBA AGF deadlocks because a future allocation attempt in this 3584 * transaction may attempt to lock a lower number AGF. 3585 * 3586 * We can't release the AGF until the transaction is committed, so at 3587 * this point we must update the "first allocation" tracker to point at 3588 * this AG if the tracker is empty or points to a lower AG. This allows 3589 * the next allocation attempt to be modified appropriately to avoid 3590 * deadlocks. 3591 */ 3592 if (args->agbp && 3593 (args->tp->t_highest_agno == NULLAGNUMBER || 3594 args->agno > minimum_agno)) 3595 args->tp->t_highest_agno = args->agno; 3596 3597 /* 3598 * If the allocation failed with an error or we had an ENOSPC result, 3599 * preserve the returned error whilst also marking the allocation result 3600 * as "no extent allocated". This ensures that callers that fail to 3601 * capture the error will still treat it as a failed allocation.
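 * (args->fsbno == NULLFSBLOCK is the "nothing was allocated" marker that
 * callers check, whether or not an error code is also returned.)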
3602 */ 3603 if (alloc_error || args->agbno == NULLAGBLOCK) { 3604 args->fsbno = NULLFSBLOCK; 3605 error = alloc_error; 3606 goto out_drop_perag; 3607 } 3608 3609 args->fsbno = xfs_agbno_to_fsb(args->pag, args->agbno); 3610 3611 ASSERT(args->len >= args->minlen); 3612 ASSERT(args->len <= args->maxlen); 3613 ASSERT(args->agbno % args->alignment == 0); 3614 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); 3615 3616 /* if not file data, insert new block into the reverse map btree */ 3617 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 3618 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 3619 args->agbno, args->len, &args->oinfo); 3620 if (error) 3621 goto out_drop_perag; 3622 } 3623 3624 if (!args->wasfromfl) { 3625 error = xfs_alloc_update_counters(args->tp, args->agbp, 3626 -((long)(args->len))); 3627 if (error) 3628 goto out_drop_perag; 3629 3630 ASSERT(!xfs_extent_busy_search(pag_group(args->pag), 3631 args->agbno, args->len)); 3632 } 3633 3634 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 3635 3636 XFS_STATS_INC(mp, xs_allocx); 3637 XFS_STATS_ADD(mp, xs_allocb, args->len); 3638 3639 trace_xfs_alloc_vextent_finish(args); 3640 3641 out_drop_perag: 3642 if (drop_perag && args->pag) { 3643 xfs_perag_rele(args->pag); 3644 args->pag = NULL; 3645 } 3646 return error; 3647 } 3648 3649 /* 3650 * Allocate within a single AG only. This uses a best-fit length algorithm so if 3651 * you need an exact sized allocation without locality constraints, this is the 3652 * fastest way to do it. 3653 * 3654 * Caller is expected to hold a perag reference in args->pag. 3655 */ 3656 int 3657 xfs_alloc_vextent_this_ag( 3658 struct xfs_alloc_arg *args, 3659 xfs_agnumber_t agno) 3660 { 3661 xfs_agnumber_t minimum_agno; 3662 uint32_t alloc_flags = 0; 3663 int error; 3664 3665 ASSERT(args->pag != NULL); 3666 ASSERT(pag_agno(args->pag) == agno); 3667 3668 args->agno = agno; 3669 args->agbno = 0; 3670 3671 trace_xfs_alloc_vextent_this_ag(args); 3672 3673 error = xfs_alloc_vextent_check_args(args, 3674 xfs_agbno_to_fsb(args->pag, 0), &minimum_agno); 3675 if (error) { 3676 if (error == -ENOSPC) 3677 return 0; 3678 return error; 3679 } 3680 3681 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3682 if (!error && args->agbp) 3683 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3684 3685 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3686 } 3687 3688 /* 3689 * Iterate all AGs trying to allocate an extent starting from @start_ag. 3690 * 3691 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the 3692 * allocation attempts in @start_agno have locality information. If we fail to 3693 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs 3694 * we attempt to allocation in as there is no locality optimisation possible for 3695 * those allocations. 3696 * 3697 * On return, args->pag may be left referenced if we finish before the "all 3698 * failed" return point. The allocation finish still needs the perag, and 3699 * so the caller will release it once they've finished the allocation. 3700 * 3701 * When we wrap the AG iteration at the end of the filesystem, we have to be 3702 * careful not to wrap into AGs below ones we already have locked in the 3703 * transaction if we are doing a blocking iteration. This will result in an 3704 * out-of-order locking of AGFs and hence can cause deadlocks. 
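 * For example, if this transaction already holds the AGF of AG 3 (so
 * minimum_agno is 3) and a blocking pass starts at AG 5, the iteration
 * may wrap around to AGs 3 and 4 but must never touch AGs 0-2.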
3705 */ 3706 static int 3707 xfs_alloc_vextent_iterate_ags( 3708 struct xfs_alloc_arg *args, 3709 xfs_agnumber_t minimum_agno, 3710 xfs_agnumber_t start_agno, 3711 xfs_agblock_t target_agbno, 3712 uint32_t alloc_flags) 3713 { 3714 struct xfs_mount *mp = args->mp; 3715 xfs_agnumber_t restart_agno = minimum_agno; 3716 xfs_agnumber_t agno; 3717 int error = 0; 3718 3719 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) 3720 restart_agno = 0; 3721 restart: 3722 for_each_perag_wrap_range(mp, start_agno, restart_agno, 3723 mp->m_sb.sb_agcount, agno, args->pag) { 3724 args->agno = agno; 3725 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3726 if (error) 3727 break; 3728 if (!args->agbp) { 3729 trace_xfs_alloc_vextent_loopfailed(args); 3730 continue; 3731 } 3732 3733 /* 3734 * Allocation is supposed to succeed now, so break out of the 3735 * loop regardless of whether we succeed or not. 3736 */ 3737 if (args->agno == start_agno && target_agbno) { 3738 args->agbno = target_agbno; 3739 error = xfs_alloc_ag_vextent_near(args, alloc_flags); 3740 } else { 3741 args->agbno = 0; 3742 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3743 } 3744 break; 3745 } 3746 if (error) { 3747 xfs_perag_rele(args->pag); 3748 args->pag = NULL; 3749 return error; 3750 } 3751 if (args->agbp) 3752 return 0; 3753 3754 /* 3755 * We didn't find an AG we can allocate from. If we were given 3756 * constraining flags by the caller, drop them and retry the allocation 3757 * without any constraints being set. 3758 */ 3759 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { 3760 alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; 3761 restart_agno = minimum_agno; 3762 goto restart; 3763 } 3764 3765 ASSERT(args->pag == NULL); 3766 trace_xfs_alloc_vextent_allfailed(args); 3767 return 0; 3768 } 3769 3770 /* 3771 * Iterate through the AGs from the start AG to the end of the filesystem, trying 3772 * to allocate blocks. It starts with a near allocation attempt in the initial 3773 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap 3774 * back to zero if allowed by previous allocations in this transaction, 3775 * otherwise will wrap back to the start AG and run a second blocking pass to 3776 * the end of the filesystem.
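 * For the initial user data allocation into a file on an inode32
 * filesystem, the locality target is ignored and the starting AG is taken
 * from the m_agfrotor rotor instead, which spreads new files across all
 * AGs.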
3777 */ 3778 int 3779 xfs_alloc_vextent_start_ag( 3780 struct xfs_alloc_arg *args, 3781 xfs_fsblock_t target) 3782 { 3783 struct xfs_mount *mp = args->mp; 3784 xfs_agnumber_t minimum_agno; 3785 xfs_agnumber_t start_agno; 3786 xfs_agnumber_t rotorstep = xfs_rotorstep; 3787 bool bump_rotor = false; 3788 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3789 int error; 3790 3791 ASSERT(args->pag == NULL); 3792 3793 args->agno = NULLAGNUMBER; 3794 args->agbno = NULLAGBLOCK; 3795 3796 trace_xfs_alloc_vextent_start_ag(args); 3797 3798 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3799 if (error) { 3800 if (error == -ENOSPC) 3801 return 0; 3802 return error; 3803 } 3804 3805 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3806 xfs_is_inode32(mp)) { 3807 target = XFS_AGB_TO_FSB(mp, 3808 ((mp->m_agfrotor / rotorstep) % 3809 mp->m_sb.sb_agcount), 0); 3810 bump_rotor = 1; 3811 } 3812 3813 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3814 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3815 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3816 3817 if (bump_rotor) { 3818 if (args->agno == start_agno) 3819 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3820 (mp->m_sb.sb_agcount * rotorstep); 3821 else 3822 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3823 (mp->m_sb.sb_agcount * rotorstep); 3824 } 3825 3826 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3827 } 3828 3829 /* 3830 * Iterate from the agno indicated via @target through to the end of the 3831 * filesystem attempting blocking allocation. This does not wrap or try a second 3832 * pass, so will not recurse into AGs lower than indicated by the target. 3833 */ 3834 int 3835 xfs_alloc_vextent_first_ag( 3836 struct xfs_alloc_arg *args, 3837 xfs_fsblock_t target) 3838 { 3839 struct xfs_mount *mp = args->mp; 3840 xfs_agnumber_t minimum_agno; 3841 xfs_agnumber_t start_agno; 3842 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3843 int error; 3844 3845 ASSERT(args->pag == NULL); 3846 3847 args->agno = NULLAGNUMBER; 3848 args->agbno = NULLAGBLOCK; 3849 3850 trace_xfs_alloc_vextent_first_ag(args); 3851 3852 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3853 if (error) { 3854 if (error == -ENOSPC) 3855 return 0; 3856 return error; 3857 } 3858 3859 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3860 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3861 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3862 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3863 } 3864 3865 /* 3866 * Allocate at the exact block target or fail. Caller is expected to hold a 3867 * perag reference in args->pag. 
 */
int
xfs_alloc_vextent_exact_bno(
        struct xfs_alloc_arg    *args,
        xfs_fsblock_t           target)
{
        struct xfs_mount        *mp = args->mp;
        xfs_agnumber_t          minimum_agno;
        int                     error;

        ASSERT(args->pag != NULL);
        ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));

        args->agno = XFS_FSB_TO_AGNO(mp, target);
        args->agbno = XFS_FSB_TO_AGBNO(mp, target);

        trace_xfs_alloc_vextent_exact_bno(args);

        error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
        if (error) {
                if (error == -ENOSPC)
                        return 0;
                return error;
        }

        error = xfs_alloc_vextent_prepare_ag(args, 0);
        if (!error && args->agbp)
                error = xfs_alloc_ag_vextent_exact(args);

        return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
}

/*
 * Allocate an extent as close to the target as possible. If there are no
 * viable candidates in the AG, then fail the allocation.
 *
 * Caller may or may not have a per-ag reference in args->pag.
 */
int
xfs_alloc_vextent_near_bno(
        struct xfs_alloc_arg    *args,
        xfs_fsblock_t           target)
{
        struct xfs_mount        *mp = args->mp;
        xfs_agnumber_t          minimum_agno;
        bool                    needs_perag = args->pag == NULL;
        uint32_t                alloc_flags = 0;
        int                     error;

        if (!needs_perag)
                ASSERT(pag_agno(args->pag) == XFS_FSB_TO_AGNO(mp, target));

        args->agno = XFS_FSB_TO_AGNO(mp, target);
        args->agbno = XFS_FSB_TO_AGBNO(mp, target);

        trace_xfs_alloc_vextent_near_bno(args);

        error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
        if (error) {
                if (error == -ENOSPC)
                        return 0;
                return error;
        }

        if (needs_perag)
                args->pag = xfs_perag_grab(mp, args->agno);

        error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
        if (!error && args->agbp)
                error = xfs_alloc_ag_vextent_near(args, alloc_flags);

        return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
}

/* Ensure that the freelist is at full capacity. */
int
xfs_free_extent_fix_freelist(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        struct xfs_buf          **agbp)
{
        struct xfs_alloc_arg    args;
        int                     error;

        memset(&args, 0, sizeof(struct xfs_alloc_arg));
        args.tp = tp;
        args.mp = tp->t_mountp;
        args.agno = pag_agno(pag);
        args.pag = pag;

        /*
         * Validate that the AG number is legal - this enables us to detect
         * and handle a silent filesystem corruption rather than crashing.
         */
        if (args.agno >= args.mp->m_sb.sb_agcount)
                return -EFSCORRUPTED;

        error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
        if (error)
                return error;

        *agbp = args.agbp;
        return 0;
}

/*
 * Free an extent.
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
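 *
 * Hypothetical usage sketch only (the perag lookup, the agbno/len values and
 * the error handling are illustrative, not taken from a real caller):
 *
 *	error = __xfs_free_extent(tp, pag, agbno, len,
 *			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_NONE, false);
 *
 * Passing skip_discard == true marks the resulting busy extent with
 * XFS_EXTENT_BUSY_SKIP_DISCARD so the freed blocks are not handed to online
 * discard when the busy extent is cleared.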
 */
int
__xfs_free_extent(
        struct xfs_trans                *tp,
        struct xfs_perag                *pag,
        xfs_agblock_t                   agbno,
        xfs_extlen_t                    len,
        const struct xfs_owner_info     *oinfo,
        enum xfs_ag_resv_type           type,
        bool                            skip_discard)
{
        struct xfs_mount                *mp = tp->t_mountp;
        struct xfs_buf                  *agbp;
        struct xfs_agf                  *agf;
        int                             error;
        unsigned int                    busy_flags = 0;

        ASSERT(len != 0);
        ASSERT(type != XFS_AG_RESV_AGFL);

        if (XFS_TEST_ERROR(false, mp,
                        XFS_ERRTAG_FREE_EXTENT))
                return -EIO;

        error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
        if (error) {
                if (xfs_metadata_is_sick(error))
                        xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
                return error;
        }

        agf = agbp->b_addr;

        if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
                xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
                error = -EFSCORRUPTED;
                goto err_release;
        }

        /* validate the extent size is legal now we have the agf locked */
        if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
                xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
                error = -EFSCORRUPTED;
                goto err_release;
        }

        error = xfs_free_ag_extent(tp, agbp, agbno, len, oinfo, type);
        if (error)
                goto err_release;

        if (skip_discard)
                busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
        xfs_extent_busy_insert(tp, pag_group(pag), agbno, len, busy_flags);
        return 0;

err_release:
        xfs_trans_brelse(tp, agbp);
        return error;
}

struct xfs_alloc_query_range_info {
        xfs_alloc_query_range_fn        fn;
        void                            *priv;
};

/* Format btree record and pass to our callback. */
STATIC int
xfs_alloc_query_range_helper(
        struct xfs_btree_cur            *cur,
        const union xfs_btree_rec       *rec,
        void                            *priv)
{
        struct xfs_alloc_query_range_info       *query = priv;
        struct xfs_alloc_rec_incore             irec;
        xfs_failaddr_t                          fa;

        xfs_alloc_btrec_to_irec(rec, &irec);
        fa = xfs_alloc_check_irec(to_perag(cur->bc_group), &irec);
        if (fa)
                return xfs_alloc_complain_bad_rec(cur, fa, &irec);

        return query->fn(cur, &irec, query->priv);
}

/* Find all free space within a given range of blocks. */
int
xfs_alloc_query_range(
        struct xfs_btree_cur                    *cur,
        const struct xfs_alloc_rec_incore       *low_rec,
        const struct xfs_alloc_rec_incore       *high_rec,
        xfs_alloc_query_range_fn                fn,
        void                                    *priv)
{
        union xfs_btree_irec                    low_brec = { .a = *low_rec };
        union xfs_btree_irec                    high_brec = { .a = *high_rec };
        struct xfs_alloc_query_range_info       query = { .priv = priv, .fn = fn };

        ASSERT(xfs_btree_is_bno(cur->bc_ops));
        return xfs_btree_query_range(cur, &low_brec, &high_brec,
                        xfs_alloc_query_range_helper, &query);
}

/* Find all free space records. */
int
xfs_alloc_query_all(
        struct xfs_btree_cur            *cur,
        xfs_alloc_query_range_fn        fn,
        void                            *priv)
{
        struct xfs_alloc_query_range_info       query;

        ASSERT(xfs_btree_is_bno(cur->bc_ops));
        query.priv = priv;
        query.fn = fn;
        return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}

/*
 * Scan part of the keyspace of the free space and tell us if the area has no
 * records, is fully mapped by records, or is partially filled.
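 *
 * Illustrative sketch of interpreting the result (the cursor setup and the
 * surrounding code are hypothetical):
 *
 *	enum xbtree_recpacking	outcome;
 *
 *	error = xfs_alloc_has_records(cur, bno, len, &outcome);
 *	if (error)
 *		return error;
 *	if (outcome == XBTREE_RECPACKING_FULL)
 *		... every block in [bno, bno + len) is free space ...
 *	else if (outcome == XBTREE_RECPACKING_EMPTY)
 *		... no part of the range is recorded as free ...
 *	else
 *		... XBTREE_RECPACKING_SPARSE: the range is partially free ...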
 */
int
xfs_alloc_has_records(
        struct xfs_btree_cur    *cur,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        enum xbtree_recpacking  *outcome)
{
        union xfs_btree_irec    low;
        union xfs_btree_irec    high;

        memset(&low, 0, sizeof(low));
        low.a.ar_startblock = bno;
        memset(&high, 0xFF, sizeof(high));
        high.a.ar_startblock = bno + len - 1;

        return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}

/*
 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
 * error code or XFS_ITER_*.
 */
int
xfs_agfl_walk(
        struct xfs_mount        *mp,
        struct xfs_agf          *agf,
        struct xfs_buf          *agflbp,
        xfs_agfl_walk_fn        walk_fn,
        void                    *priv)
{
        __be32                  *agfl_bno;
        unsigned int            i;
        int                     error;

        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
        i = be32_to_cpu(agf->agf_flfirst);

        /* Nothing to walk in an empty AGFL. */
        if (agf->agf_flcount == cpu_to_be32(0))
                return 0;

        /* Otherwise, walk from first to last, wrapping as needed. */
        for (;;) {
                error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
                if (error)
                        return error;
                if (i == be32_to_cpu(agf->agf_fllast))
                        break;
                if (++i == xfs_agfl_size(mp))
                        i = 0;
        }

        return 0;
}

int __init
xfs_extfree_intent_init_cache(void)
{
        xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
                        sizeof(struct xfs_extent_free_item),
                        0, 0, NULL);

        return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_extfree_intent_destroy_cache(void)
{
        kmem_cache_destroy(xfs_extfree_item_cache);
        xfs_extfree_item_cache = NULL;
}
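
/*
 * Illustrative example only, not part of the code above: a minimal
 * xfs_agfl_walk_fn callback that counts the AGFL blocks it visits. The
 * function name and the unsigned int counter passed through @priv are
 * hypothetical.
 *
 *	static int
 *	xfs_example_count_agfl_block(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		unsigned int		*count = priv;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 * A caller holding the AGF and AGFL buffers would then do something like:
 *
 *	unsigned int		count = 0;
 *
 *	error = xfs_agfl_walk(mp, agbp->b_addr, agflbp,
 *			xfs_example_count_agfl_block, &count);
 *
 * Returning a nonzero value from the callback (a negative errno or one of
 * the XFS_ITER_* codes mentioned above xfs_agfl_walk()) stops the walk early.
 */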