// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/* Should we allow writing to mounted block devices? */
static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

static inline struct inode *BD_INODE(struct block_device *bdev)
{
	return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode;
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

struct block_device *file_bdev(struct file *bdev_file)
{
	return I_BDEV(bdev_file->f_mapping->host);
}
EXPORT_SYMBOL(file_bdev);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = BD_INODE(bdev);
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
				"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}
EXPORT_SYMBOL(invalidate_bdev);
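
/*
 * Illustrative sketch only (hypothetical helper, not used anywhere in the
 * kernel): a driver that has just detected a media change could write back
 * anything dirty and then drop the now-stale clean cache.  kill_bdev()
 * above is the heavier hammer that also throws away dirty data and is
 * reserved for callers that know the cached contents are invalid.
 */
static void __maybe_unused example_revalidate_cache(struct block_device *bdev)
{
	/* Push out dirty pages first so nothing useful is lost. */
	sync_blockdev(bdev);
	/* Then drop clean, unused buffers and pagecache. */
	invalidate_bdev(bdev);
}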

/*
 * Drop all buffers & page cache for given bdev range. This function bails
 * out with an error if the bdev has another exclusive owner (such as a
 * filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold an exclusive handle for the device, upgrade to one
	 * while we discard the buffer cache to avoid discarding buffers
	 * under a live filesystem.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating
	 * instead. The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(BD_INODE(bdev));

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
	mapping_set_folio_min_order(BD_INODE(bdev)->i_mapping,
				    get_order(bsize));
}

/**
 * bdev_validate_blocksize - check that this block size is acceptable
 * @bdev: blockdevice to check
 * @block_size: block size to check
 *
 * For block device users that do not use buffer heads or the block device
 * page cache, make sure that this block size can be used with the device.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_validate_blocksize(struct block_device *bdev, int block_size)
{
	if (blk_validate_block_size(block_size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (block_size < bdev_logical_block_size(bdev))
		return -EINVAL;

	return 0;
}
EXPORT_SYMBOL_GPL(bdev_validate_blocksize);
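
/*
 * Illustrative sketch only (hypothetical helper): validating a block size
 * against an already-open block device file before switching to it.
 * set_blocksize() below performs the same validation internally, so this
 * merely shows the intended call order for callers that want to check the
 * size up front.
 */
static int __maybe_unused example_switch_blocksize(struct file *bdev_file,
						   int new_size)
{
	int ret;

	/*
	 * Rejects sizes that are not a power of two, below the device's
	 * logical block size, or outside the generic limits enforced by
	 * blk_validate_block_size().
	 */
	ret = bdev_validate_blocksize(file_bdev(bdev_file), new_size);
	if (ret)
		return ret;

	return set_blocksize(bdev_file, new_size);
}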

int set_blocksize(struct file *file, int size)
{
	struct inode *inode = file->f_mapping->host;
	struct block_device *bdev = I_BDEV(inode);
	int ret;

	ret = bdev_validate_blocksize(bdev, size);
	if (ret)
		return ret;

	if (!file->private_data)
		return -EINVAL;

	/* Don't change the size if it is same as current */
	if (inode->i_blkbits != blksize_bits(size)) {
		/*
		 * Flush and truncate the pagecache before we reconfigure the
		 * mapping geometry because folio sizes are variable now. If a
		 * reader has already allocated a folio whose size is smaller
		 * than the new min_order but invokes readahead after the new
		 * min_order becomes visible, readahead will think there are
		 * "zero" blocks per folio and crash. Take the inode and
		 * invalidation locks to avoid racing with
		 * read/write/fallocate.
		 */
		inode_lock(inode);
		filemap_invalidate_lock(inode->i_mapping);

		sync_blockdev(bdev);
		kill_bdev(bdev);

		inode->i_blkbits = blksize_bits(size);
		mapping_set_folio_min_order(inode->i_mapping, get_order(size));
		kill_bdev(bdev);
		filemap_invalidate_unlock(inode->i_mapping);
		inode_unlock(inode);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE)
		return 0;
	if (set_blocksize(sb->s_bdev_file, size))
		return 0;
	/* If we get here, we know size is validated */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping. Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev: blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
 * down in bdev_thaw(). When it reaches 0, bdev_thaw() actually unfreezes the
 * filesystem.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);
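
/*
 * Illustrative sketch only (hypothetical snapshot helper): bdev_freeze() and
 * bdev_thaw() must be balanced; nesting is handled by bd_fsfreeze_count, so
 * a caller only has to pair the two around the critical section.
 */
static int __maybe_unused example_with_frozen_bdev(struct block_device *bdev,
		int (*snapshot)(struct block_device *bdev))
{
	int ret;

	ret = bdev_freeze(bdev);
	if (ret)
		return ret;
	/* Any filesystem on @bdev is now consistent on disk. */
	ret = snapshot(bdev);
	bdev_thaw(bdev);
	return ret;
}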

/**
 * bdev_thaw - unlock filesystem
 * @bdev: blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
	 */
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));

	if (security_bdev_alloc(&ei->bdev)) {
		kmem_cache_free(bdev_cachep, ei);
		return NULL;
	}
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);
	security_bdev_free(bdev);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __ro_after_init;
static struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = blockdev_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	atomic_set(&bdev->__bd_flags, partno);
	bdev->bd_mapping = &inode->i_data;
	bdev->bd_queue = disk->queue;
	if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
		bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(BD_INODE(bdev), (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	struct inode *inode = BD_INODE(bdev);
	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_mapping);
	bdev->bd_dev = dev;
	inode->i_rdev = dev;
	inode->i_ino = dev;
	insert_inode_hash(inode);
}

void bdev_unhash(struct block_device *bdev)
{
	remove_inode_hash(BD_INODE(bdev));
}

void bdev_drop(struct block_device *bdev)
{
	iput(BD_INODE(bdev));
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
		 */
		if (bdev->bd_holder == holder) {
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole device's holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev. This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = __var_waitqueue(&whole->bd_claiming);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
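
/*
 * Illustrative sketch only (hypothetical helper): a temporary exclusive
 * claim without an actual open, mirroring what truncate_bdev_range() above
 * does to keep other exclusive openers out while it works on the page
 * cache.  @holder is any unique non-NULL cookie.
 */
static int __maybe_unused example_with_temporary_claim(struct block_device *bdev,
		void *holder, int (*work)(struct block_device *bdev))
{
	int ret;

	ret = bd_prepare_to_claim(bdev, holder, NULL);
	if (ret)
		return ret;
	ret = work(bdev);
	/* No bd_finish_claiming(): a real exclusive open was never wanted. */
	bd_abort_claiming(bdev, holder);
	return ret;
}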

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_var(&whole->bd_claiming);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can be
 * also used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device. The holder fields are protected with
	 * bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove the holder link and unblock
	 * event polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
	}
}

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
		/*
		 * Only return scanning errors if we are called from contexts
		 * that explicitly want them, e.g. the BLKRRPART ioctl.
		 */
		ret = bdev_disk_changed(disk, false);
		if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
			blkdev_put_whole(bdev);
			return ret;
		}
	}
	return 0;
}

static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires an exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device, so passing an error pointer in
	 * as @holder makes no sense here either.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

struct block_device *blkdev_get_no_open(dev_t dev, bool autoload)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
				"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model reference: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}
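
/*
 * bd_writers sign convention (as implemented by the helpers above and by
 * bdev_claim_write_access()/bdev_yield_write_access() below, and only
 * relevant when bdev_allow_write_mounted is false): a positive value counts
 * openers holding plain write access, a negative value counts holders that
 * opened with BLK_OPEN_RESTRICT_WRITES and thereby block new plain writers.
 * bdev_may_open() keeps the two states mutually exclusive.
 */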

static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}

static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access. Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
 */
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto put_module;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested. Any write
		 * holder makes the write_holder state stick until all are
		 * released. This is good enough and tracking individual
		 * writeable references is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) &&
		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev_set_flag(bdev, BD_WRITE_HOLDER);
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
 * associated with the floppy driver where it has allowed ioctls if the
 * file was opened for writing, but does not allow reads or writes.
 * Make sure that this quirk is reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev, true);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev),
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	ihold(BD_INODE(bdev));

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);
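
/*
 * Illustrative sketch only (hypothetical caller): an exclusive open by
 * dev_t (a non-NULL @holder makes the open exclusive), I/O through the
 * bdev, then release via bdev_fput().  A real filesystem would normally
 * also pass BLK_OPEN_RESTRICT_WRITES and a struct blk_holder_ops.
 */
static int __maybe_unused example_open_and_sync(dev_t dev, void *holder)
{
	struct file *bdev_file;
	struct block_device *bdev;
	int ret;

	bdev_file = bdev_file_open_by_dev(dev,
			BLK_OPEN_READ | BLK_OPEN_WRITE, holder, NULL);
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);

	bdev = file_bdev(bdev_file);
	ret = sync_blockdev(bdev);

	/* Drops the exclusive claim before the (deferred) final fput. */
	bdev_fput(bdev_file);
	return ret;
}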

struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);

static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one. If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (could be several minutes)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event. This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed, which is a
 * deferred operation.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell ->release() that we already gave up our hold on the
		 * device and, if write restrictions are in effect, that we
		 * already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Look up the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead. If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes, and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly. There are some drivers however
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);
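
/*
 * Illustrative sketch only (hypothetical holder): the holder callbacks are
 * invoked with bd_holder_lock held and are expected to drop it, as the
 * lockdep_assert_not_held() calls in bdev_freeze()/bdev_thaw() and the
 * asymmetric unlock in bdev_mark_dead() above show.  fs/super.c implements
 * the real filesystem variant of these operations.
 */
static void __maybe_unused example_holder_mark_dead(struct block_device *bdev,
						    bool surprise)
{
	mutex_unlock(&bdev->bd_holder_lock);
	if (!surprise)
		sync_blockdev(bdev);
}

static int __maybe_unused example_holder_freeze(struct block_device *bdev)
{
	mutex_unlock(&bdev->bd_holder_lock);
	return sync_blockdev(bdev);
}

static int __maybe_unused example_holder_thaw(struct block_device *bdev)
{
	mutex_unlock(&bdev->bd_holder_lock);
	return 0;
}

static const struct blk_holder_ops example_holder_ops __maybe_unused = {
	.mark_dead	= example_holder_mark_dead,
	.freeze		= example_holder_freeze,
	.thaw		= example_holder_thaw,
};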

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from the s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mapping so
			 * that applications can catch the writeback error using
			 * fsync(2). See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices.
 */
void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask)
{
	struct block_device *bdev;

	/*
	 * Note that d_backing_inode() returns the block device node inode, not
	 * the block device's internal inode. Therefore it is *not* valid to
	 * use I_BDEV() here; the block device has to be looked up by i_rdev
	 * instead.
	 */
	bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false);
	if (!bdev)
		return;

	if (request_mask & STATX_DIOALIGN) {
		stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
		stat->dio_offset_align = bdev_logical_block_size(bdev);
		stat->result_mask |= STATX_DIOALIGN;
	}

	if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
		struct request_queue *bd_queue = bdev->bd_queue;

		generic_fill_statx_atomic_writes(stat,
			queue_atomic_write_unit_min_bytes(bd_queue),
			queue_atomic_write_unit_max_bytes(bd_queue));
	}

	stat->blksize = bdev_io_min(bdev);

	blkdev_put_no_open(bdev);
}

bool disk_live(struct gendisk *disk)
{
	return !inode_unhashed(BD_INODE(disk->part0));
}
EXPORT_SYMBOL_GPL(disk_live);

unsigned int block_size(struct block_device *bdev)
{
	return 1 << BD_INODE(bdev)->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted: '%s'\n",
			str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);