// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_scrub.h"
#include "xfs_buf_mem.h"
#include "xfs_rmap.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
#include "xfs_dir2.h"
#include "xfs_parent.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
#include "scrub/stats.h"
#include "scrub/xfile.h"
#include "scrub/tempfile.h"
#include "scrub/orphanage.h"

/*
 * Online Scrub and Repair
 *
 * Traditionally, XFS (the kernel driver) did not know how to check or
 * repair on-disk data structures.  That task was left to the xfs_check
 * and xfs_repair tools, both of which require taking the filesystem
 * offline for a thorough but time-consuming examination.  Online
 * scrub & repair, on the other hand, enables us to check the metadata
 * for obvious errors while carefully stepping around the filesystem's
 * ongoing operations, locking rules, etc.
 *
 * Given that most XFS metadata consist of records stored in a btree,
 * most of the checking functions iterate the btree blocks themselves
 * looking for irregularities.  When a record block is encountered, each
 * record can be checked for obviously bad values.  Record values can
 * also be cross-referenced against other btrees to look for potential
 * misunderstandings between pieces of metadata.
 *
 * It is expected that the checkers responsible for per-AG metadata
 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
 * metadata structure, and perform any relevant cross-referencing before
 * unlocking the AG and returning the results to userspace.  These
 * scrubbers must not keep an AG locked for too long to avoid tying up
 * the block and inode allocators.
 *
 * Block maps and b-trees rooted in an inode present a special challenge
 * because they can involve extents from any AG.  The general scrubber
 * structure of lock -> check -> xref -> unlock still holds, but AG
 * locking order rules /must/ be obeyed to avoid deadlocks.  The
 * ordering rule, of course, is that we must lock in increasing AG
 * order.  Helper functions are provided to track which AG headers we've
 * already locked.  If we detect an imminent locking order violation, we
 * can signal a potential deadlock, in which case the scrubber can jump
 * out to the top level, lock all the AGs in order, and retry the scrub.
 *
 * For file data (directories, extended attributes, symlinks) scrub, we
 * can simply lock the inode and walk the data.  For btree data
 * (directories and attributes) we follow the same btree-scrubbing
 * strategy outlined previously to check the records.
 *
 * We use a bit of trickery with transactions to avoid buffer deadlocks
 * if there is a cycle in the metadata.  The basic problem is that
 * travelling down a btree involves locking the current buffer at each
 * tree level.  If a pointer should somehow point back to a buffer that
 * we've already examined, we will deadlock due to the second buffer
 * locking attempt.  Note however that grabbing a buffer in transaction
 * context links the locked buffer to the transaction.  If we try to
 * re-grab the buffer in the context of the same transaction, we avoid
 * the second lock attempt and continue.  Between the verifier and the
 * scrubber, something will notice that something is amiss and report
 * the corruption.  Therefore, each scrubber will allocate an empty
 * transaction, attach buffers to it, and cancel the transaction at the
 * end of the scrub run.  Cancelling a non-dirty transaction simply
 * unlocks the buffers.
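 *
 * As an illustrative sketch of that pattern (the buffer-read call and
 * its arguments vary by scrubber and are elided here):
 *
 *	error = xfs_trans_alloc_empty(mp, &sc->tp);
 *	if (error)
 *		return error;
 *	error = xfs_trans_read_buf(...);	// locks the buffer and
 *						// joins it to sc->tp
 *	error = xfs_trans_read_buf(...);	// same buffer: the lock we
 *						// already hold is reused
 *	xfs_trans_cancel(sc->tp);		// clean cancel just unlocks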
 *
 * There are four pieces of data that scrub can communicate to
 * userspace.  The first is the error code (errno), which can be used to
 * communicate operational errors in performing the scrub.  There are
 * also three flags that can be set in the scrub context.  If the data
 * structure itself is corrupt, the CORRUPT flag will be set.  If
 * the metadata is correct but otherwise suboptimal, the PREEN flag
 * will be set.  If the scrub could not finish examining the metadata,
 * the INCOMPLETE flag will be set.
 *
 * We perform secondary validation of filesystem metadata by
 * cross-referencing every record with all other available metadata.
 * For example, for block mapping extents, we verify that there are no
 * records in the free space and inode btrees corresponding to that
 * space extent and that there is a corresponding entry in the reverse
 * mapping btree.  Inconsistent metadata is noted by setting the
 * XCORRUPT flag; btree query function errors are noted by setting the
 * XFAIL flag and deleting the cursor to prevent further attempts to
 * cross-reference with a defective btree.
 *
 * If a piece of metadata proves corrupt or suboptimal, the userspace
 * program can ask the kernel to apply some tender loving care (TLC) to
 * the metadata object by setting the REPAIR flag and re-calling the
 * scrub ioctl.  "Corruption" is defined by metadata violating the
 * on-disk specification; operations cannot continue if the violation is
 * left untreated.  It is possible for XFS to continue if an object is
 * "suboptimal", however performance may be degraded.  Repairs are
 * usually performed by rebuilding the metadata entirely out of
 * redundant metadata.  Optimizing, on the other hand, can sometimes be
 * done without rebuilding entire structures.
 *
 * Generally speaking, the repair code has the following code structure:
 * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
 * The first check helps us figure out if we need to rebuild or simply
 * optimize the structure so that the rebuild knows what to do.  The
 * second check evaluates the completeness of the repair; that is what
 * is reported to userspace.
 *
 * A quick note on symbol prefixes:
 * - "xfs_" are general XFS symbols.
 * - "xchk_" are symbols related to metadata checking.
 * - "xrep_" are symbols related to metadata repair.
 * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
 */

/*
 * Scrub probe -- userspace uses this to probe if we're willing to scrub
 * or repair a given mountpoint.  This will be used by xfs_scrub to
 * probe the kernel's abilities to scrub (and repair) the metadata.  We
 * do this by validating the ioctl inputs from userspace, preparing the
 * filesystem for a scrub (or a repair) operation, and immediately
 * returning to userspace.  Userspace can use the returned errno and
 * structure state to decide (in broad terms) if scrub/repair are
 * supported by the running kernel.
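 *
 * For example, userspace might probe like this (an illustrative sketch;
 * the type, flags, and ioctl number all come from xfs_fs.h):
 *
 *	struct xfs_scrub_metadata sm = {
 *		.sm_type = XFS_SCRUB_TYPE_PROBE,
 *	};
 *
 *	error = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
 *
 * A zero return means scrub is supported; adding XFS_SCRUB_IFLAG_REPAIR
 * to sm_flags probes for repair support instead (EOPNOTSUPP if online
 * repair is not built into the kernel).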
 */
static int
xchk_probe(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * If the caller is probing to see if repair works but repair isn't
	 * built into the kernel, return EOPNOTSUPP because that's the signal
	 * that userspace expects.  If online repair is built in, set the
	 * CORRUPT flag (without any of the usual tracing/logging) to force us
	 * into xrep_probe.
	 */
	if (xchk_could_repair(sc)) {
		if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR))
			return -EOPNOTSUPP;
		sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	}
	return 0;
}

/* Scrub setup and teardown */

static inline void
xchk_fsgates_disable(
	struct xfs_scrub	*sc)
{
	if (!(sc->flags & XCHK_FSGATES_ALL))
		return;

	trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);

	if (sc->flags & XCHK_FSGATES_DRAIN)
		xfs_defer_drain_wait_disable();

	if (sc->flags & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_disable();

	if (sc->flags & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_disable();

	if (sc->flags & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_disable();

	sc->flags &= ~XCHK_FSGATES_ALL;
}

/* Free the resources associated with a scrub subtype. */
void
xchk_scrub_free_subord(
	struct xfs_scrub_subord	*sub)
{
	struct xfs_scrub	*sc = sub->parent_sc;

	ASSERT(sc->ip == sub->sc.ip);
	ASSERT(sc->orphanage == sub->sc.orphanage);
	ASSERT(sc->tempip == sub->sc.tempip);

	sc->sm->sm_type = sub->old_smtype;
	sc->sm->sm_flags = sub->old_smflags |
				(sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
	sc->tp = sub->sc.tp;

	if (sub->sc.buf) {
		if (sub->sc.buf_cleanup)
			sub->sc.buf_cleanup(sub->sc.buf);
		kvfree(sub->sc.buf);
	}
	if (sub->sc.xmbtp)
		xmbuf_free(sub->sc.xmbtp);
	if (sub->sc.xfile)
		xfile_destroy(sub->sc.xfile);

	sc->ilock_flags = sub->sc.ilock_flags;
	sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
	sc->temp_ilock_flags = sub->sc.temp_ilock_flags;

	kfree(sub);
}

/* Free all the resources and finish the transactions. */
STATIC int
xchk_teardown(
	struct xfs_scrub	*sc,
	int			error)
{
	xchk_ag_free(sc, &sc->sa);
	xchk_rtgroup_btcur_free(&sc->sr);

	if (sc->tp) {
		if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
			error = xfs_trans_commit(sc->tp);
		else
			xfs_trans_cancel(sc->tp);
		sc->tp = NULL;
	}
	if (sc->sr.rtg)
		xchk_rtgroup_free(sc, &sc->sr);
	if (sc->ip) {
		if (sc->ilock_flags)
			xchk_iunlock(sc, sc->ilock_flags);
		xchk_irele(sc, sc->ip);
		sc->ip = NULL;
	}
	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
		mnt_drop_write_file(sc->file);
	}
	if (sc->xmbtp) {
		xmbuf_free(sc->xmbtp);
		sc->xmbtp = NULL;
	}
	if (sc->xfile) {
		xfile_destroy(sc->xfile);
		sc->xfile = NULL;
	}
	if (sc->buf) {
		if (sc->buf_cleanup)
			sc->buf_cleanup(sc->buf);
		kvfree(sc->buf);
		sc->buf_cleanup = NULL;
		sc->buf = NULL;
	}

	xrep_tempfile_rele(sc);
	xrep_orphanage_rele(sc);
	xchk_fsgates_disable(sc);
	return error;
}

/* Scrubbing dispatch. */

static const struct xchk_meta_ops meta_scrub_ops[] = {
	[XFS_SCRUB_TYPE_PROBE] = {	/* ioctl presence test */
		.type	= ST_NONE,
		.setup	= xchk_setup_fs,
		.scrub	= xchk_probe,
		.repair	= xrep_probe,
	},
	[XFS_SCRUB_TYPE_SB] = {		/* superblock */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_superblock,
		.repair	= xrep_superblock,
	},
	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agf,
		.repair	= xrep_agf,
	},
	[XFS_SCRUB_TYPE_AGFL] = {	/* agfl */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agfl,
		.repair	= xrep_agfl,
	},
	[XFS_SCRUB_TYPE_AGI] = {	/* agi */
		.type	= ST_PERAG,
		.setup	= xchk_setup_agheader,
		.scrub	= xchk_agi,
		.repair	= xrep_agi,
	},
	[XFS_SCRUB_TYPE_BNOBT] = {	/* bnobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_allocbt,
		.scrub	= xchk_allocbt,
		.repair	= xrep_allocbt,
		.repair_eval = xrep_revalidate_allocbt,
	},
	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_allocbt,
		.scrub	= xchk_allocbt,
		.repair	= xrep_allocbt,
		.repair_eval = xrep_revalidate_allocbt,
	},
	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_iallocbt,
		.scrub	= xchk_iallocbt,
		.repair	= xrep_iallocbt,
		.repair_eval = xrep_revalidate_iallocbt,
	},
	[XFS_SCRUB_TYPE_FINOBT] = {	/* finobt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_iallocbt,
		.scrub	= xchk_iallocbt,
		.has	= xfs_has_finobt,
		.repair	= xrep_iallocbt,
		.repair_eval = xrep_revalidate_iallocbt,
	},
	[XFS_SCRUB_TYPE_RMAPBT] = {	/* rmapbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_rmapbt,
		.scrub	= xchk_rmapbt,
		.has	= xfs_has_rmapbt,
		.repair	= xrep_rmapbt,
	},
	[XFS_SCRUB_TYPE_REFCNTBT] = {	/* refcountbt */
		.type	= ST_PERAG,
		.setup	= xchk_setup_ag_refcountbt,
		.scrub	= xchk_refcountbt,
		.has	= xfs_has_reflink,
		.repair	= xrep_refcountbt,
	},
	[XFS_SCRUB_TYPE_INODE] = {	/* inode record */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode,
		.scrub	= xchk_inode,
		.repair	= xrep_inode,
	},
	[XFS_SCRUB_TYPE_BMBTD] = {	/* inode data fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_data,
		.repair	= xrep_bmap_data,
	},
	[XFS_SCRUB_TYPE_BMBTA] = {	/* inode attr fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_attr,
		.repair	= xrep_bmap_attr,
	},
	[XFS_SCRUB_TYPE_BMBTC] = {	/* inode CoW fork */
		.type	= ST_INODE,
		.setup	= xchk_setup_inode_bmap,
		.scrub	= xchk_bmap_cow,
		.repair	= xrep_bmap_cow,
	},
	[XFS_SCRUB_TYPE_DIR] = {	/* directory */
		.type	= ST_INODE,
		.setup	= xchk_setup_directory,
		.scrub	= xchk_directory,
		.repair	= xrep_directory,
	},
	[XFS_SCRUB_TYPE_XATTR] = {	/* extended attributes */
		.type	= ST_INODE,
		.setup	= xchk_setup_xattr,
		.scrub	= xchk_xattr,
		.repair	= xrep_xattr,
	},
	[XFS_SCRUB_TYPE_SYMLINK] = {	/* symbolic link */
		.type	= ST_INODE,
		.setup	= xchk_setup_symlink,
		.scrub	= xchk_symlink,
		.repair	= xrep_symlink,
	},
	[XFS_SCRUB_TYPE_PARENT] = {	/* parent pointers */
		.type	= ST_INODE,
		.setup	= xchk_setup_parent,
		.scrub	= xchk_parent,
		.repair	= xrep_parent,
	},
	[XFS_SCRUB_TYPE_RTBITMAP] = {	/* realtime bitmap */
		.type	= ST_RTGROUP,
		.has	= xfs_has_nonzoned,
		.setup	= xchk_setup_rtbitmap,
		.scrub	= xchk_rtbitmap,
		.repair	= xrep_rtbitmap,
	},
	[XFS_SCRUB_TYPE_RTSUM] = {	/* realtime summary */
		.type	= ST_RTGROUP,
		.has	= xfs_has_nonzoned,
		.setup	= xchk_setup_rtsummary,
		.scrub	= xchk_rtsummary,
		.repair	= xrep_rtsummary,
	},
	[XFS_SCRUB_TYPE_UQUOTA] = {	/* user quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_GQUOTA] = {	/* group quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_PQUOTA] = {	/* project quota */
		.type	= ST_FS,
		.setup	= xchk_setup_quota,
		.scrub	= xchk_quota,
		.repair	= xrep_quota,
	},
	[XFS_SCRUB_TYPE_FSCOUNTERS] = {	/* fs summary counters */
		.type	= ST_FS,
		.setup	= xchk_setup_fscounters,
		.scrub	= xchk_fscounters,
		.repair	= xrep_fscounters,
	},
	[XFS_SCRUB_TYPE_QUOTACHECK] = {	/* quota counters */
		.type	= ST_FS,
		.setup	= xchk_setup_quotacheck,
		.scrub	= xchk_quotacheck,
		.repair	= xrep_quotacheck,
	},
	[XFS_SCRUB_TYPE_NLINKS] = {	/* inode link counts */
		.type	= ST_FS,
		.setup	= xchk_setup_nlinks,
		.scrub	= xchk_nlinks,
		.repair	= xrep_nlinks,
	},
	[XFS_SCRUB_TYPE_HEALTHY] = {	/* fs healthy; clean all reminders */
		.type	= ST_FS,
		.setup	= xchk_setup_fs,
		.scrub	= xchk_health_record,
		.repair	= xrep_notsupported,
	},
	[XFS_SCRUB_TYPE_DIRTREE] = {	/* directory tree structure */
		.type	= ST_INODE,
		.setup	= xchk_setup_dirtree,
		.scrub	= xchk_dirtree,
		.has	= xfs_has_parent,
		.repair	= xrep_dirtree,
	},
	[XFS_SCRUB_TYPE_METAPATH] = {	/* metadata directory tree path */
		.type	= ST_GENERIC,
		.setup	= xchk_setup_metapath,
		.scrub	= xchk_metapath,
		.has	= xfs_has_metadir,
		.repair	= xrep_metapath,
	},
	[XFS_SCRUB_TYPE_RGSUPER] = {	/* realtime group superblock */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rgsuperblock,
		.scrub	= xchk_rgsuperblock,
		.has	= xfs_has_rtsb,
		.repair	= xrep_rgsuperblock,
	},
	[XFS_SCRUB_TYPE_RTRMAPBT] = {	/* realtime group rmapbt */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtrmapbt,
		.scrub	= xchk_rtrmapbt,
		.has	= xfs_has_rtrmapbt,
		.repair	= xrep_rtrmapbt,
	},
	[XFS_SCRUB_TYPE_RTREFCBT] = {	/* realtime refcountbt */
		.type	= ST_RTGROUP,
		.setup	= xchk_setup_rtrefcountbt,
		.scrub	= xchk_rtrefcountbt,
		.has	= xfs_has_rtreflink,
		.repair	= xrep_rtrefcountbt,
	},
};
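
/*
 * Every scrub type dispatches through the table above; in outline,
 * xfs_scrub_metadata() below does:
 *
 *	sc->ops = &meta_scrub_ops[sm->sm_type];
 *	error = sc->ops->setup(sc);
 *	error = sc->ops->scrub(sc);
 *
 * so an entry needs at least .setup and .scrub (enforced in
 * xchk_validate_inputs() below); .has, .repair, and .repair_eval are
 * optional.
 */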

static int
xchk_validate_inputs(
	struct xfs_mount	*mp,
	struct xfs_scrub_metadata *sm)
{
	int			error;
	const struct xchk_meta_ops *ops;

	error = -EINVAL;
	/* Check our inputs. */
	sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
		goto out;
	/* sm_reserved[] must be zero */
	if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
		goto out;

	error = -ENOENT;
	/* Do we know about this type of metadata? */
	if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
		goto out;
	ops = &meta_scrub_ops[sm->sm_type];
	if (ops->setup == NULL || ops->scrub == NULL)
		goto out;
	/* Does this fs even support this type of metadata? */
	if (ops->has && !ops->has(mp))
		goto out;

	error = -EINVAL;
	/* restricting fields must be appropriate for type */
	switch (ops->type) {
	case ST_NONE:
	case ST_FS:
		if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
			goto out;
		break;
	case ST_PERAG:
		if (sm->sm_ino || sm->sm_gen ||
		    sm->sm_agno >= mp->m_sb.sb_agcount)
			goto out;
		break;
	case ST_INODE:
		if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
			goto out;
		break;
	case ST_GENERIC:
		break;
	case ST_RTGROUP:
		if (sm->sm_ino || sm->sm_gen)
			goto out;
		if (xfs_has_rtgroups(mp)) {
			/*
			 * On a rtgroups filesystem, there won't be an rtbitmap
			 * or rtsummary file for group 0 unless there's
			 * actually a realtime volume attached.  However, older
			 * xfs_scrub always calls the rtbitmap/rtsummary
			 * scrubbers with sm_agno==0 so transform the error
			 * code to ENOENT.
			 */
			if (sm->sm_agno >= mp->m_sb.sb_rgcount) {
				if (sm->sm_agno == 0)
					error = -ENOENT;
				goto out;
			}
		} else {
			/*
			 * Prior to rtgroups, the rtbitmap/rtsummary scrubbers
			 * accepted sm_agno==0, so we still accept that for
			 * scrubbing pre-rtgroups filesystems.
			 */
			if (sm->sm_agno != 0)
				goto out;
		}
		break;
	default:
		goto out;
	}

	/* No rebuild without repair. */
	if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
	    !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return -EINVAL;

	/*
	 * We only want to repair read-write v5+ filesystems.  Defer the check
	 * for ops->repair until after our scrub confirms that we need to
	 * perform repairs so that we avoid failing due to not supporting
	 * repairing an object that doesn't need repairs.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = -EOPNOTSUPP;
		if (!xfs_has_crc(mp))
			goto out;

		error = -EROFS;
		if (xfs_is_readonly(mp))
			goto out;
	}

	error = 0;
out:
	return error;
}
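
/*
 * As the header comment notes, userspace asks for repair by setting the
 * REPAIR flag and re-calling the scrub ioctl.  An illustrative sketch of
 * that second call, where sm is the structure filled out by the first
 * scrub call:
 *
 *	sm.sm_flags |= XFS_SCRUB_IFLAG_REPAIR;
 *	error = ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm);
 *
 * The validation above is where the repair prerequisites (a V5
 * filesystem that is still writable) are enforced.
 */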

#ifdef CONFIG_XFS_ONLINE_REPAIR
static inline void xchk_postmortem(struct xfs_scrub *sc)
{
	/*
	 * Userspace asked us to repair something, we repaired it, rescanned
	 * it, and the rescan says it's still broken.  Scream about this in
	 * the system logs.
	 */
	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
	    (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
				 XFS_SCRUB_OFLAG_XCORRUPT)))
		xrep_failure(sc->mp);
}
#else
static inline void xchk_postmortem(struct xfs_scrub *sc)
{
	/*
	 * Userspace asked us to scrub something, it's broken, and we have no
	 * way of fixing it.  Scream in the logs.
	 */
	if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
				XFS_SCRUB_OFLAG_XCORRUPT))
		xfs_alert_ratelimited(sc->mp,
				"Corruption detected during scrub.");
}
#endif /* CONFIG_XFS_ONLINE_REPAIR */

/*
 * Create a new scrub context from an existing one, but with a different scrub
 * type.
 */
struct xfs_scrub_subord *
xchk_scrub_create_subord(
	struct xfs_scrub	*sc,
	unsigned int		subtype)
{
	struct xfs_scrub_subord	*sub;

	sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
	if (!sub)
		return ERR_PTR(-ENOMEM);

	sub->old_smtype = sc->sm->sm_type;
	sub->old_smflags = sc->sm->sm_flags;
	sub->parent_sc = sc;
	memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
	sub->sc.ops = &meta_scrub_ops[subtype];
	sub->sc.sm->sm_type = subtype;
	sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	sub->sc.buf = NULL;
	sub->sc.buf_cleanup = NULL;
	sub->sc.xfile = NULL;
	sub->sc.xmbtp = NULL;

	return sub;
}
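
/*
 * A typical pairing with xchk_scrub_free_subord() above might look like
 * this (an illustrative sketch only; real callers add their own locking
 * and setup around the subordinate scrub):
 *
 *	sub = xchk_scrub_create_subord(sc, subtype);
 *	if (IS_ERR(sub))
 *		return PTR_ERR(sub);
 *	error = sub->sc.ops->scrub(&sub->sc);
 *	xchk_scrub_free_subord(sub);
 */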

/* Dispatch metadata scrubbing. */
STATIC int
xfs_scrub_metadata(
	struct file		*file,
	struct xfs_scrub_metadata *sm)
{
	struct xchk_stats_run	run = { };
	struct xfs_scrub	*sc;
	struct xfs_mount	*mp = XFS_I(file_inode(file))->i_mount;
	u64			check_start;
	int			error = 0;

	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
		     (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));

	trace_xchk_start(XFS_I(file_inode(file)), sm, error);

	/* Forbidden if we are shut down or mounted norecovery. */
	error = -ESHUTDOWN;
	if (xfs_is_shutdown(mp))
		goto out;
	error = -ENOTRECOVERABLE;
	if (xfs_has_norecovery(mp))
		goto out;

	error = xchk_validate_inputs(mp, sm);
	if (error)
		goto out;

	sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
	if (!sc) {
		error = -ENOMEM;
		goto out;
	}

	sc->mp = mp;
	sc->file = file;
	sc->sm = sm;
	sc->ops = &meta_scrub_ops[sm->sm_type];
	sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
	sc->relax = INIT_XCHK_RELAX;
retry_op:
	/*
	 * When repairs are allowed, prevent freezing or readonly remount while
	 * scrub is running with a real transaction.
	 */
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
		error = mnt_want_write_file(sc->file);
		if (error)
			goto out_sc;

		sc->flags |= XCHK_HAVE_FREEZE_PROT;
	}

	/* Set up for the operation. */
	error = sc->ops->setup(sc);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error)
		goto out_teardown;

	/* Scrub for errors. */
	check_start = xchk_stats_now();
	if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
		error = sc->ops->repair_eval(sc);
	else
		error = sc->ops->scrub(sc);
	run.scrub_ns += xchk_stats_elapsed_ns(check_start);
	if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
		goto try_harder;
	if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
		goto need_drain;
	if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
		goto out_teardown;

	xchk_update_health(sc);

	if (xchk_could_repair(sc)) {
		/*
		 * If userspace asked for a repair but it wasn't necessary,
		 * report that back to userspace.
		 */
		if (!xrep_will_attempt(sc)) {
			sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
			goto out_nofix;
		}

		/*
		 * If it's broken, userspace wants us to fix it, and we haven't
		 * already tried to fix it, then attempt a repair.
		 */
		error = xrep_attempt(sc, &run);
		if (error == -EAGAIN) {
			/*
			 * Either the repair function succeeded or it couldn't
			 * get all the resources it needs; either way, we go
			 * back to the beginning and call the scrub function.
			 */
			error = xchk_teardown(sc, 0);
			if (error) {
				xrep_failure(mp);
				goto out_sc;
			}
			goto retry_op;
		}
	}

out_nofix:
	xchk_postmortem(sc);
out_teardown:
	error = xchk_teardown(sc, error);
out_sc:
	if (error != -ENOENT)
		xchk_stats_merge(mp, sm, &run);
	kfree(sc);
out:
	trace_xchk_done(XFS_I(file_inode(file)), sm, error);
	if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
		sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
		error = 0;
	}
	return error;
need_drain:
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_NEED_DRAIN;
	run.retries++;
	goto retry_op;
try_harder:
	/*
	 * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
	 * everything we hold, then set up again with preparation for
	 * worst-case scenarios.
	 */
	error = xchk_teardown(sc, 0);
	if (error)
		goto out_sc;
	sc->flags |= XCHK_TRY_HARDER;
	run.retries++;
	goto retry_op;
}

/* Scrub one aspect of one piece of metadata. */
int
xfs_ioc_scrub_metadata(
	struct file		*file,
	void			__user *arg)
{
	struct xfs_scrub_metadata scrub;
	int			error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&scrub, arg, sizeof(scrub)))
		return -EFAULT;

	error = xfs_scrub_metadata(file, &scrub);
	if (error)
		return error;

	if (copy_to_user(arg, &scrub, sizeof(scrub)))
		return -EFAULT;

	return 0;
}

/* Decide if there have been any scrub failures up to this point. */
static inline int
xfs_scrubv_check_barrier(
	struct xfs_mount	*mp,
	const struct xfs_scrub_vec *vectors,
	const struct xfs_scrub_vec *stop_vec)
{
	const struct xfs_scrub_vec *v;
	__u32			failmask;

	failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;

	for (v = vectors; v < stop_vec; v++) {
		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
			continue;

		/*
		 * Runtime errors count as a previous failure, except the ones
		 * used to ask userspace to retry.
		 */
		switch (v->sv_ret) {
		case -EBUSY:
		case -ENOENT:
		case -EUSERS:
		case 0:
			break;
		default:
			return -ECANCELED;
		}

		/*
		 * If any of the out-flags on the scrub vector match the mask
		 * that was set on the barrier vector, that's a previous fail.
		 */
		if (v->sv_flags & failmask)
			return -ECANCELED;
	}

	return 0;
}
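
/*
 * For example (an illustrative sketch of the userspace side of a
 * vectored call to xfs_ioc_scrubv_metadata() below; only the types and
 * ioctl number from xfs_fs.h are assumed), a barrier can stop the run
 * if anything before it reported corruption:
 *
 *	struct xfs_scrub_vec vecs[] = {
 *		{ .sv_type = XFS_SCRUB_TYPE_AGF },
 *		{ .sv_type = XFS_SCRUB_TYPE_AGI },
 *		{ .sv_type = XFS_SCRUB_TYPE_BARRIER,
 *		  .sv_flags = XFS_SCRUB_OFLAG_CORRUPT },
 *		{ .sv_type = XFS_SCRUB_TYPE_INOBT },
 *	};
 *	struct xfs_scrub_vec_head head = {
 *		.svh_agno = agno,
 *		.svh_nr = ARRAY_SIZE(vecs),
 *		.svh_vectors = (uintptr_t)vecs,
 *	};
 *
 *	error = ioctl(fd, XFS_IOC_SCRUBV_METADATA, &head);
 *
 * If the AGF or AGI scrub sets OFLAG_CORRUPT, the barrier's sv_ret is
 * set to -ECANCELED and the remaining vectors are skipped.
 */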

/*
 * If the caller provided us with a nonzero inode number that isn't the ioctl
 * file, try to grab a reference to it to eliminate all further untrusted inode
 * lookups.  If we can't get the inode, let each scrub function try again.
 */
STATIC struct xfs_inode *
xchk_scrubv_open_by_handle(
	struct xfs_mount	*mp,
	const struct xfs_scrub_vec_head *head)
{
	struct xfs_trans	*tp;
	struct xfs_inode	*ip;
	int			error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return NULL;

	error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
	xfs_trans_cancel(tp);
	if (error)
		return NULL;

	if (VFS_I(ip)->i_generation != head->svh_gen) {
		xfs_irele(ip);
		return NULL;
	}

	return ip;
}

/* Vectored scrub implementation to reduce ioctl calls. */
int
xfs_ioc_scrubv_metadata(
	struct file		*file,
	void			__user *arg)
{
	struct xfs_scrub_vec_head head;
	struct xfs_scrub_vec_head __user *uhead = arg;
	struct xfs_scrub_vec	*vectors;
	struct xfs_scrub_vec	__user *uvectors;
	struct xfs_inode	*ip_in = XFS_I(file_inode(file));
	struct xfs_mount	*mp = ip_in->i_mount;
	struct xfs_inode	*handle_ip = NULL;
	struct xfs_scrub_vec	*v;
	size_t			vec_bytes;
	unsigned int		i;
	int			error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&head, uhead, sizeof(head)))
		return -EFAULT;

	if (head.svh_reserved)
		return -EINVAL;
	if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
		return -EINVAL;
	if (head.svh_nr == 0)
		return 0;

	vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
	if (vec_bytes > PAGE_SIZE)
		return -ENOMEM;

	uvectors = u64_to_user_ptr(head.svh_vectors);
	vectors = memdup_user(uvectors, vec_bytes);
	if (IS_ERR(vectors))
		return PTR_ERR(vectors);

	trace_xchk_scrubv_start(ip_in, &head);

	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		if (v->sv_reserved) {
			error = -EINVAL;
			goto out_free;
		}

		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
		    (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
			error = -EINVAL;
			goto out_free;
		}

		trace_xchk_scrubv_item(mp, &head, i, v);
	}

	/*
	 * If the caller wants us to do a scrub-by-handle and the file used to
	 * call the ioctl is not the same file, load the incore inode and pin
	 * it across all the scrubv actions to avoid repeated UNTRUSTED
	 * lookups.  The reference is not passed to deeper layers of scrub
	 * because each scrubber gets to decide its own strategy and return
	 * values for getting an inode.
	 */
	if (head.svh_ino && head.svh_ino != ip_in->i_ino)
		handle_ip = xchk_scrubv_open_by_handle(mp, &head);

	/* Run all the scrubbers. */
	for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
		struct xfs_scrub_metadata	sm = {
			.sm_type	= v->sv_type,
			.sm_flags	= v->sv_flags,
			.sm_ino		= head.svh_ino,
			.sm_gen		= head.svh_gen,
			.sm_agno	= head.svh_agno,
		};

		if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
			v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
			if (v->sv_ret) {
				trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
				break;
			}

			continue;
		}

		v->sv_ret = xfs_scrub_metadata(file, &sm);
		v->sv_flags = sm.sm_flags;

		trace_xchk_scrubv_outcome(mp, &head, i, v);

		if (head.svh_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					head.svh_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out_free;
		}
	}

	if (copy_to_user(uvectors, vectors, vec_bytes) ||
	    copy_to_user(uhead, &head, sizeof(head))) {
		error = -EFAULT;
		goto out_free;
	}

out_free:
	if (handle_ip)
		xfs_irele(handle_ip);
	kfree(vectors);
	return error;
}