Lines Matching +full:write +full:- +full:to +full:- +full:write
1 // SPDX-License-Identifier: GPL-2.0
15 #include <linux/blk-mq.h>
21 #include "blk-mq-sched.h"
22 #include "blk-mq-debugfs.h"
38 * Per-zone write plug.
40 * @ref: Zone write plug reference counter. A zone write plug reference is
43 * submitted and when a function needs to manipulate a plug. The
46 * reference is dropped whenever the zone of the zone write plug is reset,
47 * finished and when the zone becomes full (last write BIO to the zone
49 * @lock: Spinlock to atomically manipulate the plug.
52 * @wp_offset: The zone write pointer location relative to the start of the zone
55 * @bio_work: Work struct to handle issuing of plugged BIOs
56 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
57 * @disk: The gendisk the plug belongs to.
73 * Zone write plug flags bits:
74 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
75 * that is, that write BIOs are being throttled due to a write BIO already
76 * being executed or the zone write plug bio list is not empty.
77 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
78 * write pointer offset and need to update it.
79 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
80 * from the disk hash table and that the initial reference to the zone
81 * write plug set when the plug was first added to the hash table has been
83 * to prevent new references to the zone write plug to be taken for
84 * newly incoming BIOs. A zone write plug flagged with this flag will be
92 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
95 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
123 struct gendisk *disk = args->disk; in disk_report_zones_cb()
125 if (disk->zone_wplugs_hash) in disk_report_zones_cb()
128 if (!args->user_cb) in disk_report_zones_cb()
131 return args->user_cb(zone, idx, args->user_data); in disk_report_zones_cb()
135 * blkdev_report_zones - Get zones information
137 * @sector: Sector from which to report zones
138 * @nr_zones: Maximum number of zones to report
145 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
146 * constant can be passed to @nr_zones.
150 * Note: The caller must use memalloc_noXX_save/restore() calls to control
156 struct gendisk *disk = bdev->bd_disk; in blkdev_report_zones()
164 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) in blkdev_report_zones()
165 return -EOPNOTSUPP; in blkdev_report_zones()
170 return disk->fops->report_zones(disk, sector, nr_zones, in blkdev_report_zones()
184 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
186 * @op: Operation to be performed on the zones
187 * @sector: Start sector of the first zone to operate on
195 * The operation to execute on each zone can be a zone reset, open, close
208 return -EOPNOTSUPP; in blkdev_zone_mgmt()
211 return -EPERM; in blkdev_zone_mgmt()
214 return -EOPNOTSUPP; in blkdev_zone_mgmt()
218 return -EINVAL; in blkdev_zone_mgmt()
222 return -EINVAL; in blkdev_zone_mgmt()
225 return -EINVAL; in blkdev_zone_mgmt()
236 bio->bi_iter.bi_sector = sector; in blkdev_zone_mgmt()
239 /* This may take a while, so be nice to others */ in blkdev_zone_mgmt()
259 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) in blkdev_copy_zone_to_user()
260 return -EFAULT; in blkdev_copy_zone_to_user()
277 return -EINVAL; in blkdev_report_zones_ioctl()
280 return -ENOTTY; in blkdev_report_zones_ioctl()
283 return -EFAULT; in blkdev_report_zones_ioctl()
286 return -EINVAL; in blkdev_report_zones_ioctl()
297 return -EFAULT; in blkdev_report_zones_ioctl()
306 if (zrange->sector + zrange->nr_sectors <= zrange->sector || in blkdev_truncate_zone_range()
307 zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) in blkdev_truncate_zone_range()
309 return -EINVAL; in blkdev_truncate_zone_range()
311 start = zrange->sector << SECTOR_SHIFT; in blkdev_truncate_zone_range()
312 end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; in blkdev_truncate_zone_range()
330 return -EINVAL; in blkdev_zone_mgmt_ioctl()
333 return -ENOTTY; in blkdev_zone_mgmt_ioctl()
336 return -EBADF; in blkdev_zone_mgmt_ioctl()
339 return -EFAULT; in blkdev_zone_mgmt_ioctl()
346 inode_lock(bdev->bd_mapping->host); in blkdev_zone_mgmt_ioctl()
347 filemap_invalidate_lock(bdev->bd_mapping); in blkdev_zone_mgmt_ioctl()
362 return -ENOTTY; in blkdev_zone_mgmt_ioctl()
369 filemap_invalidate_unlock(bdev->bd_mapping); in blkdev_zone_mgmt_ioctl()
370 inode_unlock(bdev->bd_mapping->host); in blkdev_zone_mgmt_ioctl()
378 return zone->start + zone->len >= get_capacity(disk); in disk_zone_is_last()
384 if (zno < disk->nr_zones - 1) in disk_zone_is_full()
385 return offset_in_zone >= disk->zone_capacity; in disk_zone_is_full()
386 return offset_in_zone >= disk->last_zone_capacity; in disk_zone_is_full()
392 return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); in disk_zone_wplug_is_full()
401 hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); in disk_insert_zone_wplug()
404 * Add the new zone write plug to the hash table, but carefully as we in disk_insert_zone_wplug()
406 * zone write plug for the same zone. in disk_insert_zone_wplug()
408 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
409 hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { in disk_insert_zone_wplug()
410 if (zwplg->zone_no == zwplug->zone_no) { in disk_insert_zone_wplug()
411 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
415 hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); in disk_insert_zone_wplug()
416 atomic_inc(&disk->nr_zone_wplugs); in disk_insert_zone_wplug()
417 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
426 unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); in disk_get_hashed_zone_wplug()
431 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { in disk_get_hashed_zone_wplug()
432 if (zwplug->zone_no == zno && in disk_get_hashed_zone_wplug()
433 refcount_inc_not_zero(&zwplug->ref)) { in disk_get_hashed_zone_wplug()
447 if (!atomic_read(&disk->nr_zone_wplugs)) in disk_get_zone_wplug()
458 mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); in disk_free_zone_wplug_rcu()
463 if (refcount_dec_and_test(&zwplug->ref)) { in disk_put_zone_wplug()
464 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); in disk_put_zone_wplug()
465 WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); in disk_put_zone_wplug()
466 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); in disk_put_zone_wplug()
468 call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); in disk_put_zone_wplug()
475 lockdep_assert_held(&zwplug->lock); in disk_should_remove_zone_wplug()
477 /* If the zone write plug was already removed, we are done. */ in disk_should_remove_zone_wplug()
478 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) in disk_should_remove_zone_wplug()
481 /* If the zone write plug is still plugged, it cannot be removed. */ in disk_should_remove_zone_wplug()
482 if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) in disk_should_remove_zone_wplug()
490 * should not attempt to remove the zone write plug until all BIO in disk_should_remove_zone_wplug()
491 * completions are seen. Check by looking at the zone write plug in disk_should_remove_zone_wplug()
496 if (refcount_read(&zwplug->ref) > 2) in disk_should_remove_zone_wplug()
499 /* We can remove zone write plugs for zones that are empty or full. */ in disk_should_remove_zone_wplug()
500 return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); in disk_should_remove_zone_wplug()
508 /* If the zone write plug was already removed, we have nothing to do. */ in disk_remove_zone_wplug()
509 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) in disk_remove_zone_wplug()
513 * Mark the zone write plug as unhashed and drop the extra reference we in disk_remove_zone_wplug()
516 zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; in disk_remove_zone_wplug()
517 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_remove_zone_wplug()
518 hlist_del_init_rcu(&zwplug->node); in disk_remove_zone_wplug()
519 atomic_dec(&disk->nr_zone_wplugs); in disk_remove_zone_wplug()
520 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_remove_zone_wplug()
527 * Get a reference on the write plug for the zone containing @sector.
529 * Return a pointer to the zone write plug with the plug spinlock held.
543 * operation has not already removed the zone write plug from in disk_get_and_lock_zone_wplug()
545 * we need to get a new plug so start over from the beginning. in disk_get_and_lock_zone_wplug()
547 spin_lock_irqsave(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
548 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { in disk_get_and_lock_zone_wplug()
549 spin_unlock_irqrestore(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
557 * Allocate and initialize a zone write plug with an extra reference in disk_get_and_lock_zone_wplug()
558 * so that it is not freed when the zone write plug becomes idle without in disk_get_and_lock_zone_wplug()
561 zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); in disk_get_and_lock_zone_wplug()
565 INIT_HLIST_NODE(&zwplug->node); in disk_get_and_lock_zone_wplug()
566 refcount_set(&zwplug->ref, 2); in disk_get_and_lock_zone_wplug()
567 spin_lock_init(&zwplug->lock); in disk_get_and_lock_zone_wplug()
568 zwplug->flags = 0; in disk_get_and_lock_zone_wplug()
569 zwplug->zone_no = zno; in disk_get_and_lock_zone_wplug()
570 zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); in disk_get_and_lock_zone_wplug()
571 bio_list_init(&zwplug->bio_list); in disk_get_and_lock_zone_wplug()
572 INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); in disk_get_and_lock_zone_wplug()
573 zwplug->disk = disk; in disk_get_and_lock_zone_wplug()
575 spin_lock_irqsave(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
578 * Insert the new zone write plug in the hash table. This can fail only in disk_get_and_lock_zone_wplug()
583 spin_unlock_irqrestore(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
584 mempool_free(zwplug, disk->zone_wplugs_pool); in disk_get_and_lock_zone_wplug()
594 struct request_queue *q = zwplug->disk->queue; in blk_zone_wplug_bio_io_error()
604 * Abort (fail) all plugged BIOs of a zone write plug.
610 if (bio_list_empty(&zwplug->bio_list)) in disk_zone_wplug_abort()
614 zwplug->disk->disk_name, zwplug->zone_no); in disk_zone_wplug_abort()
615 while ((bio = bio_list_pop(&zwplug->bio_list))) in disk_zone_wplug_abort()
620 * Set a zone write plug write pointer offset to the specified value.
623 * update from a report zone after a write error.
629 lockdep_assert_held(&zwplug->lock); in disk_zone_wplug_set_wp_offset()
631 /* Update the zone write pointer and abort all plugged BIOs. */ in disk_zone_wplug_set_wp_offset()
632 zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; in disk_zone_wplug_set_wp_offset()
633 zwplug->wp_offset = wp_offset; in disk_zone_wplug_set_wp_offset()
637 * The zone write plug now has no BIO plugged: remove it from the in disk_zone_wplug_set_wp_offset()
647 switch (zone->cond) { in blk_zone_wp_offset()
651 return zone->wp - zone->start; in blk_zone_wp_offset()
653 return zone->len; in blk_zone_wp_offset()
661 * Conventional, offline and read-only zones do not have a valid in blk_zone_wp_offset()
662 * write pointer. in blk_zone_wp_offset()
674 zwplug = disk_get_zone_wplug(disk, zone->start); in disk_zone_wplug_sync_wp_offset()
678 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_sync_wp_offset()
679 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) in disk_zone_wplug_sync_wp_offset()
682 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_sync_wp_offset()
693 return disk->fops->report_zones(disk, sector, 1, in disk_zone_sync_wp_offset()
700 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_reset_or_finish()
701 sector_t sector = bio->bi_iter.bi_sector; in blk_zone_wplug_handle_reset_or_finish()
706 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { in blk_zone_wplug_handle_reset_or_finish()
712 * No-wait reset or finish BIOs do not make much sense as the callers in blk_zone_wplug_handle_reset_or_finish()
713 * issue these as blocking operations in most cases. To avoid issues in blk_zone_wplug_handle_reset_or_finish()
717 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) in blk_zone_wplug_handle_reset_or_finish()
718 bio->bi_opf &= ~REQ_NOWAIT; in blk_zone_wplug_handle_reset_or_finish()
721 * If we have a zone write plug, set its write pointer offset to 0 in blk_zone_wplug_handle_reset_or_finish()
722 * (reset case) or to the zone size (finish case). This will abort all in blk_zone_wplug_handle_reset_or_finish()
724 * finishing zones while writes are still in-flight will result in the in blk_zone_wplug_handle_reset_or_finish()
729 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_or_finish()
731 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_or_finish()
740 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_reset_all()
746 * Set the write pointer offset of all zone write plugs to 0. This will in blk_zone_wplug_handle_reset_all()
748 * are still in-flight will result in the writes failing anyway. in blk_zone_wplug_handle_reset_all()
751 sector += disk->queue->limits.chunk_sectors) { in blk_zone_wplug_handle_reset_all()
754 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_all()
756 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_all()
768 * Take a reference on the zone write plug and schedule the submission in disk_zone_wplug_schedule_bio_work()
772 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); in disk_zone_wplug_schedule_bio_work()
773 refcount_inc(&zwplug->ref); in disk_zone_wplug_schedule_bio_work()
774 queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); in disk_zone_wplug_schedule_bio_work()
785 * This reference will be reused to submit a request for the BIO for in disk_zone_wplug_add_bio()
786 * blk-mq devices and dropped when the BIO is failed and after in disk_zone_wplug_add_bio()
787 * it is issued in the case of BIO-based devices. in disk_zone_wplug_add_bio()
789 percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); in disk_zone_wplug_add_bio()
792 * The BIO is being plugged and thus will have to wait for the on-going in disk_zone_wplug_add_bio()
793 * write and for all other writes already plugged. So polling makes in disk_zone_wplug_add_bio()
799 * REQ_NOWAIT BIOs are always handled using the zone write plug BIO in disk_zone_wplug_add_bio()
803 if (bio->bi_opf & REQ_NOWAIT) { in disk_zone_wplug_add_bio()
804 schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); in disk_zone_wplug_add_bio()
805 bio->bi_opf &= ~REQ_NOWAIT; in disk_zone_wplug_add_bio()
809 * Reuse the poll cookie field to store the number of segments when in disk_zone_wplug_add_bio()
810 * split to the hardware limits. in disk_zone_wplug_add_bio()
812 bio->__bi_nr_segments = nr_segs; in disk_zone_wplug_add_bio()
815 * We always receive BIOs after they are split and ready to be issued. in disk_zone_wplug_add_bio()
817 * user must also issue write sequentially. So simply add the new BIO in disk_zone_wplug_add_bio()
818 * at the tail of the list to preserve the sequential write order. in disk_zone_wplug_add_bio()
820 bio_list_add(&zwplug->bio_list, bio); in disk_zone_wplug_add_bio()
822 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; in disk_zone_wplug_add_bio()
838 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). in blk_zone_write_plug_bio_merged()
839 * For this case, we already hold a reference on the zone write plug for in blk_zone_write_plug_bio_merged()
841 * zone write pointer offset update. in blk_zone_write_plug_bio_merged()
849 * Get a reference on the zone write plug of the target zone and advance in blk_zone_write_plug_bio_merged()
850 * the zone write pointer offset. Given that this is a merge, we already in blk_zone_write_plug_bio_merged()
851 * have at least one request and one BIO referencing the zone write in blk_zone_write_plug_bio_merged()
854 zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, in blk_zone_write_plug_bio_merged()
855 bio->bi_iter.bi_sector); in blk_zone_write_plug_bio_merged()
859 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_bio_merged()
860 zwplug->wp_offset += bio_sectors(bio); in blk_zone_write_plug_bio_merged()
861 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_bio_merged()
865 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
866 * already went through zone write plugging (either a new BIO or one that was
872 struct request_queue *q = req->q; in blk_zone_write_plug_init_request()
873 struct gendisk *disk = q->disk; in blk_zone_write_plug_init_request()
883 * Indicate that completion of this request needs to be handled with in blk_zone_write_plug_init_request()
885 * on the zone write plug we took above on entry to this function. in blk_zone_write_plug_init_request()
887 req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; in blk_zone_write_plug_init_request()
893 * Walk through the list of plugged BIOs to check if they can be merged in blk_zone_write_plug_init_request()
896 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_init_request()
898 bio = bio_list_peek(&zwplug->bio_list); in blk_zone_write_plug_init_request()
902 if (bio->bi_iter.bi_sector != req_back_sector || in blk_zone_write_plug_init_request()
907 !bio->__bi_nr_segments); in blk_zone_write_plug_init_request()
909 bio_list_pop(&zwplug->bio_list); in blk_zone_write_plug_init_request()
910 if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != in blk_zone_write_plug_init_request()
912 bio_list_add_head(&zwplug->bio_list, bio); in blk_zone_write_plug_init_request()
918 zwplug->wp_offset += bio_sectors(bio); in blk_zone_write_plug_init_request()
922 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_init_request()
926 * Check and prepare a BIO for submission by incrementing the write pointer
927 * offset of its zone write plug and changing zone append operations into
928 * regular write when zone append emulation is needed.
933 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_prepare_bio()
935 lockdep_assert_held(&zwplug->lock); in blk_zone_wplug_prepare_bio()
938 * If we lost track of the zone write pointer due to a write error, in blk_zone_wplug_prepare_bio()
940 * the zone to recover a reliable write pointer position. Fail BIOs if the in blk_zone_wplug_prepare_bio()
944 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) in blk_zone_wplug_prepare_bio()
948 * Check that the user is not attempting to write to a full zone. in blk_zone_wplug_prepare_bio()
950 * write pointer offset beyond the end of the zone. in blk_zone_wplug_prepare_bio()
957 * Use a regular write starting at the current write pointer. in blk_zone_wplug_prepare_bio()
958 * Similarly to native zone append operations, do not allow in blk_zone_wplug_prepare_bio()
961 bio->bi_opf &= ~REQ_OP_MASK; in blk_zone_wplug_prepare_bio()
962 bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; in blk_zone_wplug_prepare_bio()
963 bio->bi_iter.bi_sector += zwplug->wp_offset; in blk_zone_wplug_prepare_bio()
972 * Check for non-sequential writes early as we know that BIOs in blk_zone_wplug_prepare_bio()
973 * with a start sector not aligned to the zone write pointer in blk_zone_wplug_prepare_bio()
976 if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) in blk_zone_wplug_prepare_bio()
980 /* Advance the zone write pointer offset. */ in blk_zone_wplug_prepare_bio()
981 zwplug->wp_offset += bio_sectors(bio); in blk_zone_wplug_prepare_bio()
988 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_write()
989 sector_t sector = bio->bi_iter.bi_sector; in blk_zone_wplug_handle_write()
996 * zone write plug for the entire BIO. For blk-mq devices, the block in blk_zone_wplug_handle_write()
997 * layer should already have done any splitting required to ensure this in blk_zone_wplug_handle_write()
999 * BIO-based devices, it is the responsibility of the driver to split in blk_zone_wplug_handle_write()
1007 /* Conventional zones do not need write plugging. */ in blk_zone_wplug_handle_write()
1008 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { in blk_zone_wplug_handle_write()
1009 /* Zone append to conventional zones is not allowed. */ in blk_zone_wplug_handle_write()
1017 if (bio->bi_opf & REQ_NOWAIT) in blk_zone_wplug_handle_write()
1022 if (bio->bi_opf & REQ_NOWAIT) in blk_zone_wplug_handle_write()
1029 /* Indicate that this BIO is being handled using zone write plugging. */ in blk_zone_wplug_handle_write()
1033 * If the zone is already plugged, add the BIO to the plug BIO list. in blk_zone_wplug_handle_write()
1034 * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a in blk_zone_wplug_handle_write()
1038 if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || in blk_zone_wplug_handle_write()
1039 (bio->bi_opf & REQ_NOWAIT)) in blk_zone_wplug_handle_write()
1043 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1048 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_handle_write()
1050 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1057 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1064 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_native_zone_append()
1070 * going to handle @bio through plugging. However, we may already have a in blk_zone_wplug_handle_native_zone_append()
1071 * zone write plug for the target zone if that zone was previously in blk_zone_wplug_handle_native_zone_append()
1074 * zone append operations. Avoid this by removing the zone write plug. in blk_zone_wplug_handle_native_zone_append()
1076 zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); in blk_zone_wplug_handle_native_zone_append()
1080 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_handle_native_zone_append()
1083 * We are about to remove the zone write plug. But if the user in blk_zone_wplug_handle_native_zone_append()
1087 * return NULL after the plug is removed. Aborting the plugged write in blk_zone_wplug_handle_native_zone_append()
1090 * operations and regular write operations. in blk_zone_wplug_handle_native_zone_append()
1092 if (!bio_list_empty(&zwplug->bio_list)) { in blk_zone_wplug_handle_native_zone_append()
1094 disk->disk_name, zwplug->zone_no); in blk_zone_wplug_handle_native_zone_append()
1098 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_native_zone_append()
1104 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1108 * Handle write, write zeroes and zone append operations requiring emulation
1109 * using zone write plugging.
1111 * Return true whenever @bio execution needs to be delayed through the zone
1112 * write plug. Otherwise, return false to let the submission path process
1117 struct block_device *bdev = bio->bi_bdev; in blk_zone_plug_bio()
1119 if (!bdev->bd_disk->zone_wplugs_hash) in blk_zone_plug_bio()
1131 * We do not need to do anything special for empty flush BIOs, e.g. in blk_zone_plug_bio()
1133 * the responsibility of the user to first wait for the completion of in blk_zone_plug_bio()
1134 * write operations for flush to have any effect on the persistence of in blk_zone_plug_bio()
1137 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) in blk_zone_plug_bio()
1141 * Regular writes and write zeroes need to be handled through the target in blk_zone_plug_bio()
1142 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH in blk_zone_plug_bio()
1143 * which may need to go through the flush machinery depending on the in blk_zone_plug_bio()
1147 * completion, which will handle zone write plugging. in blk_zone_plug_bio()
1150 * write BIOs. in blk_zone_plug_bio()
1152 * to correctly track the write pointer offset of zones. These commands in blk_zone_plug_bio()
1153 * are not plugged as we do not need serialization with write in blk_zone_plug_bio()
1154 * operations. It is the responsibility of the user to not issue reset in blk_zone_plug_bio()
1155 * and finish commands when write operations are in flight. in blk_zone_plug_bio()
1187 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1190 if (!bio_list_empty(&zwplug->bio_list)) { in disk_zone_wplug_unplug_bio()
1192 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1196 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in disk_zone_wplug_unplug_bio()
1200 * (it was reset), remove its zone write plug from the hash table. in disk_zone_wplug_unplug_bio()
1205 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1210 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_write_plug_bio_endio()
1212 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); in blk_zone_write_plug_bio_endio()
1222 * If this is a regular write emulating a zone append operation, in blk_zone_write_plug_bio_endio()
1226 bio->bi_opf &= ~REQ_OP_MASK; in blk_zone_write_plug_bio_endio()
1227 bio->bi_opf |= REQ_OP_ZONE_APPEND; in blk_zone_write_plug_bio_endio()
1232 * needing a write pointer update. in blk_zone_write_plug_bio_endio()
1234 if (bio->bi_status != BLK_STS_OK) { in blk_zone_write_plug_bio_endio()
1235 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_bio_endio()
1237 zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; in blk_zone_write_plug_bio_endio()
1238 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_bio_endio()
1245 * For BIO-based devices, blk_zone_write_plug_finish_request() in blk_zone_write_plug_bio_endio()
1246 * is not called. So we need to schedule execution of the next in blk_zone_write_plug_bio_endio()
1249 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) in blk_zone_write_plug_bio_endio()
1258 struct gendisk *disk = req->q->disk; in blk_zone_write_plug_finish_request()
1261 zwplug = disk_get_zone_wplug(disk, req->__sector); in blk_zone_write_plug_finish_request()
1265 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; in blk_zone_write_plug_finish_request()
1291 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1294 bio = bio_list_pop(&zwplug->bio_list); in blk_zone_wplug_bio_work()
1296 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_bio_work()
1297 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1306 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1308 bdev = bio->bi_bdev; in blk_zone_wplug_bio_work()
1312 * blk-mq devices will reuse the extra reference on the request queue in blk_zone_wplug_bio_work()
1314 * path for BIO-based devices will not do that. So drop this extra in blk_zone_wplug_bio_work()
1318 blk_queue_exit(bdev->bd_disk->queue); in blk_zone_wplug_bio_work()
1327 return 1U << disk->zone_wplugs_hash_bits; in disk_zone_wplugs_hash_size()
1332 spin_lock_init(&disk->zone_wplugs_lock); in disk_init_zone_resources()
1336 * For the size of a disk zone write plug hash table, use the size of the
1337 * zone write plug mempool, which is the maximum of the disk open zones and
1339 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1349 atomic_set(&disk->nr_zone_wplugs, 0); in disk_alloc_zone_resources()
1350 disk->zone_wplugs_hash_bits = in disk_alloc_zone_resources()
1353 disk->zone_wplugs_hash = in disk_alloc_zone_resources()
1356 if (!disk->zone_wplugs_hash) in disk_alloc_zone_resources()
1357 return -ENOMEM; in disk_alloc_zone_resources()
1360 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); in disk_alloc_zone_resources()
1362 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, in disk_alloc_zone_resources()
1364 if (!disk->zone_wplugs_pool) in disk_alloc_zone_resources()
1367 disk->zone_wplugs_wq = in disk_alloc_zone_resources()
1369 pool_size, disk->disk_name); in disk_alloc_zone_resources()
1370 if (!disk->zone_wplugs_wq) in disk_alloc_zone_resources()
1376 mempool_destroy(disk->zone_wplugs_pool); in disk_alloc_zone_resources()
1377 disk->zone_wplugs_pool = NULL; in disk_alloc_zone_resources()
1379 kfree(disk->zone_wplugs_hash); in disk_alloc_zone_resources()
1380 disk->zone_wplugs_hash = NULL; in disk_alloc_zone_resources()
1381 disk->zone_wplugs_hash_bits = 0; in disk_alloc_zone_resources()
1382 return -ENOMEM; in disk_alloc_zone_resources()
1390 if (!disk->zone_wplugs_hash) in disk_destroy_zone_wplugs_hash_table()
1393 /* Free all the zone write plugs we have. */ in disk_destroy_zone_wplugs_hash_table()
1395 while (!hlist_empty(&disk->zone_wplugs_hash[i])) { in disk_destroy_zone_wplugs_hash_table()
1396 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, in disk_destroy_zone_wplugs_hash_table()
1398 refcount_inc(&zwplug->ref); in disk_destroy_zone_wplugs_hash_table()
1404 WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); in disk_destroy_zone_wplugs_hash_table()
1405 kfree(disk->zone_wplugs_hash); in disk_destroy_zone_wplugs_hash_table()
1406 disk->zone_wplugs_hash = NULL; in disk_destroy_zone_wplugs_hash_table()
1407 disk->zone_wplugs_hash_bits = 0; in disk_destroy_zone_wplugs_hash_table()
1416 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_set_conv_zones_bitmap()
1418 nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); in disk_set_conv_zones_bitmap()
1419 bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, in disk_set_conv_zones_bitmap()
1420 lockdep_is_held(&disk->zone_wplugs_lock)); in disk_set_conv_zones_bitmap()
1421 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_set_conv_zones_bitmap()
1430 if (!disk->zone_wplugs_pool) in disk_free_zone_resources()
1433 if (disk->zone_wplugs_wq) { in disk_free_zone_resources()
1434 destroy_workqueue(disk->zone_wplugs_wq); in disk_free_zone_resources()
1435 disk->zone_wplugs_wq = NULL; in disk_free_zone_resources()
1441 * Wait for the zone write plugs to be RCU-freed before in disk_free_zone_resources()
1446 mempool_destroy(disk->zone_wplugs_pool); in disk_free_zone_resources()
1447 disk->zone_wplugs_pool = NULL; in disk_free_zone_resources()
1450 disk->zone_capacity = 0; in disk_free_zone_resources()
1451 disk->last_zone_capacity = 0; in disk_free_zone_resources()
1452 disk->nr_zones = 0; in disk_free_zone_resources()
1459 * can automatically handle write BIO plugging. BIO-based device drivers in disk_need_zone_resources()
1460 * (e.g. DM devices) are normally responsible for handling zone write in disk_need_zone_resources()
1464 return queue_is_mq(disk->queue) || in disk_need_zone_resources()
1465 queue_emulates_zone_append(disk->queue); in disk_need_zone_resources()
1471 struct queue_limits *lim = &disk->queue->limits; in disk_revalidate_zone_resources()
1481 pool_size = max(lim->max_open_zones, lim->max_active_zones); in disk_revalidate_zone_resources()
1485 if (!disk->zone_wplugs_hash) in disk_revalidate_zone_resources()
1507 struct request_queue *q = disk->queue; in disk_update_zone_resources()
1512 disk->nr_zones = args->nr_zones; in disk_update_zone_resources()
1513 disk->zone_capacity = args->zone_capacity; in disk_update_zone_resources()
1514 disk->last_zone_capacity = args->last_zone_capacity; in disk_update_zone_resources()
1516 disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); in disk_update_zone_resources()
1517 if (nr_conv_zones >= disk->nr_zones) { in disk_update_zone_resources()
1519 disk->disk_name, nr_conv_zones, disk->nr_zones); in disk_update_zone_resources()
1520 return -ENODEV; in disk_update_zone_resources()
1531 nr_seq_zones = disk->nr_zones - nr_conv_zones; in disk_update_zone_resources()
1537 if (!disk->zone_wplugs_pool) in disk_update_zone_resources()
1542 * zones, set its max open zone limit to the mempool size to indicate in disk_update_zone_resources()
1543 * to the user that there is a potential performance impact due to in disk_update_zone_resources()
1544 * dynamic zone write plug allocation when simultaneously writing to in disk_update_zone_resources()
1551 mempool_resize(disk->zone_wplugs_pool, pool_size); in disk_update_zone_resources()
1567 struct gendisk *disk = args->disk; in blk_revalidate_conv_zone()
1569 if (zone->capacity != zone->len) { in blk_revalidate_conv_zone()
1571 disk->disk_name); in blk_revalidate_conv_zone()
1572 return -ENODEV; in blk_revalidate_conv_zone()
1576 args->last_zone_capacity = zone->capacity; in blk_revalidate_conv_zone()
1581 if (!args->conv_zones_bitmap) { in blk_revalidate_conv_zone()
1582 args->conv_zones_bitmap = in blk_revalidate_conv_zone()
1583 bitmap_zalloc(args->nr_zones, GFP_NOIO); in blk_revalidate_conv_zone()
1584 if (!args->conv_zones_bitmap) in blk_revalidate_conv_zone()
1585 return -ENOMEM; in blk_revalidate_conv_zone()
1588 set_bit(idx, args->conv_zones_bitmap); in blk_revalidate_conv_zone()
1596 struct gendisk *disk = args->disk; in blk_revalidate_seq_zone()
1606 if (!args->zone_capacity) in blk_revalidate_seq_zone()
1607 args->zone_capacity = zone->capacity; in blk_revalidate_seq_zone()
1609 args->last_zone_capacity = zone->capacity; in blk_revalidate_seq_zone()
1610 } else if (zone->capacity != args->zone_capacity) { in blk_revalidate_seq_zone()
1612 disk->disk_name); in blk_revalidate_seq_zone()
1613 return -ENODEV; in blk_revalidate_seq_zone()
1617 * If the device needs zone append emulation, we need to track the in blk_revalidate_seq_zone()
1618 * write pointer of all zones that are not empty nor full. So make sure in blk_revalidate_seq_zone()
1619 * we have a zone write plug for such zone if the device has a zone in blk_revalidate_seq_zone()
1620 * write plug hash table. in blk_revalidate_seq_zone()
1622 if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) in blk_revalidate_seq_zone()
1628 if (!wp_offset || wp_offset >= zone->capacity) in blk_revalidate_seq_zone()
1631 zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); in blk_revalidate_seq_zone()
1633 return -ENOMEM; in blk_revalidate_seq_zone()
1634 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_revalidate_seq_zone()
1641 * Helper function to check the validity of zones of a zoned block device.
1647 struct gendisk *disk = args->disk; in blk_revalidate_zone_cb()
1648 sector_t zone_sectors = disk->queue->limits.chunk_sectors; in blk_revalidate_zone_cb()
1652 if (zone->start != args->sector) { in blk_revalidate_zone_cb()
1654 disk->disk_name, args->sector, zone->start); in blk_revalidate_zone_cb()
1655 return -ENODEV; in blk_revalidate_zone_cb()
1658 if (zone->start >= get_capacity(disk) || !zone->len) { in blk_revalidate_zone_cb()
1660 disk->disk_name, zone->start, zone->len); in blk_revalidate_zone_cb()
1661 return -ENODEV; in blk_revalidate_zone_cb()
1669 if (zone->len != zone_sectors) { in blk_revalidate_zone_cb()
1671 disk->disk_name); in blk_revalidate_zone_cb()
1672 return -ENODEV; in blk_revalidate_zone_cb()
1674 } else if (zone->len > zone_sectors) { in blk_revalidate_zone_cb()
1676 disk->disk_name); in blk_revalidate_zone_cb()
1677 return -ENODEV; in blk_revalidate_zone_cb()
1680 if (!zone->capacity || zone->capacity > zone->len) { in blk_revalidate_zone_cb()
1682 disk->disk_name); in blk_revalidate_zone_cb()
1683 return -ENODEV; in blk_revalidate_zone_cb()
1687 switch (zone->type) { in blk_revalidate_zone_cb()
1697 disk->disk_name, (int)zone->type, zone->start); in blk_revalidate_zone_cb()
1698 ret = -ENODEV; in blk_revalidate_zone_cb()
1702 args->sector += zone->len; in blk_revalidate_zone_cb()
1708 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
1711 * Helper function for low-level device drivers to check, (re) allocate and
1713 * normally be called by blk-mq based drivers when a zoned gendisk is probed
1722 struct request_queue *q = disk->queue; in blk_revalidate_disk_zones()
1723 sector_t zone_sectors = q->limits.chunk_sectors; in blk_revalidate_disk_zones()
1727 int ret = -ENOMEM; in blk_revalidate_disk_zones()
1730 return -EIO; in blk_revalidate_disk_zones()
1733 return -ENODEV; in blk_revalidate_disk_zones()
1741 disk->disk_name, zone_sectors); in blk_revalidate_disk_zones()
1742 return -ENODEV; in blk_revalidate_disk_zones()
1750 args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); in blk_revalidate_disk_zones()
1758 ret = disk->fops->report_zones(disk, 0, UINT_MAX, in blk_revalidate_disk_zones()
1761 pr_warn("%s: No zones reported\n", disk->disk_name); in blk_revalidate_disk_zones()
1762 ret = -ENODEV; in blk_revalidate_disk_zones()
1772 disk->disk_name, args.sector); in blk_revalidate_disk_zones()
1773 ret = -ENODEV; in blk_revalidate_disk_zones()
1783 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); in blk_revalidate_disk_zones()
1796 * blk_zone_issue_zeroout - zero-fill a block range in a zone
1797 * @bdev: blockdev to write
1799 * @nr_sects: number of sectors to write
1803 * Zero-fill a block range in a zone (@sector must be equal to the zone write
1804 * pointer), handling potential errors due to the (initially unknown) lack of
1813 return -EIO; in blk_zone_issue_zeroout()
1817 if (ret != -EOPNOTSUPP) in blk_zone_issue_zeroout()
1821 * The failed call to blkdev_issue_zeroout() advanced the zone write in blk_zone_issue_zeroout()
1822 * pointer. Undo this using a report zone to update the zone write in blk_zone_issue_zeroout()
1823 * pointer to the correct current value. in blk_zone_issue_zeroout()
1825 ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); in blk_zone_issue_zeroout()
1827 return ret < 0 ? ret : -EIO; in blk_zone_issue_zeroout()
1830 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a in blk_zone_issue_zeroout()
1831 * regular write with zero-pages. in blk_zone_issue_zeroout()
1846 spin_lock_irqsave(&zwplug->lock, flags); in queue_zone_wplug_show()
1847 zwp_zone_no = zwplug->zone_no; in queue_zone_wplug_show()
1848 zwp_flags = zwplug->flags; in queue_zone_wplug_show()
1849 zwp_ref = refcount_read(&zwplug->ref); in queue_zone_wplug_show()
1850 zwp_wp_offset = zwplug->wp_offset; in queue_zone_wplug_show()
1851 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); in queue_zone_wplug_show()
1852 spin_unlock_irqrestore(&zwplug->lock, flags); in queue_zone_wplug_show()
1861 struct gendisk *disk = q->disk; in queue_zone_wplugs_show()
1865 if (!disk->zone_wplugs_hash) in queue_zone_wplugs_show()
1870 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], in queue_zone_wplugs_show()