// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset
 *       or finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the
 *             zone as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	refcount_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
 *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *    from the disk hash table and that the initial reference to the zone
 *    write plug set when the plug was first added to the hash table has been
 *    dropped. This flag is set when a zone is reset, finished or becomes full,
 *    to prevent new references to the zone write plug from being taken for
 *    newly incoming BIOs. A zone write plug flagged with this flag will be
 *    freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);

struct disk_report_zones_cb_args {
	struct gendisk	*disk;
	report_zones_cb	user_cb;
	void		*user_data;
};

static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
					   struct blk_zone *zone);

static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
				void *data)
{
	struct disk_report_zones_cb_args *args = data;
	struct gendisk *disk = args->disk;

	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (!args->user_cb)
		return 0;

	return args->user_cb(zone, idx, args->user_data);
}

/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 * Get zone information starting from the zone containing @sector for at most
 * @nr_zones, and call @cb for each zone reported by the device.
 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 * constant can be passed to @nr_zones.
 * Returns the number of zones reported by the device, or a negative errno
 * value in case of failure.
 *
 * Note: The caller must use memalloc_noXX_save/restore() calls to control
 * memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct disk_report_zones_cb_args args = {
		.disk = disk,
		.user_cb = cb,
		.user_data = data,
	};

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones,
					disk_report_zones_cb, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
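
/*
 * Illustrative use of blkdev_report_zones() (a sketch, not part of the
 * original sources): a caller counting the zones of a device could do the
 * following, with count_zone_cb() being a hypothetical report_zones_cb
 * callback that increments the unsigned int passed through @data and
 * returns 0:
 *
 *	unsigned int nr_zones = 0;
 *	int ret;
 *
 *	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, count_zone_cb,
 *				  &nr_zones);
 *	if (ret < 0)
 *		return ret;
 */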

static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone
 *	and must be zone size aligned.
 *
 * Description:
 * Perform the specified operation on the range of zones specified by
 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 * is valid, but the specified range should not contain conventional zones.
 * The operation to execute on each zone can be a zone reset, open, close
 * or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
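
/*
 * Illustrative use of blkdev_zone_mgmt() (a sketch, not part of the original
 * sources): resetting the single zone that starts at @sector of a zoned
 * block device:
 *
 *	int ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector,
 *				   bdev_zone_sectors(bdev));
 *	if (ret)
 *		pr_err("zone reset failed %d\n", ret);
 */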

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

static int blkdev_truncate_zone_range(struct block_device *bdev,
		blk_mode_t mode, const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_mapping);

	return ret;
}

static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

static bool disk_zone_is_full(struct gendisk *disk,
			      unsigned int zno, unsigned int offset_in_zone)
{
	if (zno < disk->nr_zones - 1)
		return offset_in_zone >= disk->zone_capacity;
	return offset_in_zone >= disk->last_zone_capacity;
}

static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}

static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	if (!atomic_read(&disk->nr_zone_wplugs))
		return NULL;

	return disk_get_hashed_zone_wplug(disk, sector);
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}
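
/*
 * Typical lookup pattern for the helpers above (illustrative sketch only,
 * mirroring how this file uses them): take a reference with
 * disk_get_zone_wplug(), manipulate the plug under its spinlock, then drop
 * the reference:
 *
 *	zwplug = disk_get_zone_wplug(disk, sector);
 *	if (zwplug) {
 *		spin_lock_irqsave(&zwplug->lock, flags);
 *		... inspect or update zwplug->wp_offset, flags, bio_list ...
 *		spin_unlock_irqrestore(&zwplug->lock, flags);
 *		disk_put_zone_wplug(zwplug);
 *	}
 */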

static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still plugged, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (refcount_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}

static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such
		 * case, we need to get a new plug so start over from the
		 * beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle
	 * without the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);
}
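
/*
 * blk_zone_wp_offset() below returns the write pointer position of a zone
 * relative to the zone start, as a number of 512B sectors. For example, an
 * implicitly open zone with zone->start == 524288 and zone->wp == 524800
 * has a write pointer offset of 512 sectors.
 */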

static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a
		 * valid write pointer.
		 */
		return UINT_MAX;
	}
}

static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
					   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (!zwplug)
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		disk_zone_wplug_set_wp_offset(disk, zwplug,
					      blk_zone_wp_offset(zone));
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
{
	struct disk_report_zones_cb_args args = {
		.disk = disk,
	};

	return disk->fops->report_zones(disk, sector, 1,
					disk_report_zones_cb, &args);
}

static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/* Conventional zones cannot be reset nor finished. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * No-wait reset or finish BIOs do not make much sense as the callers
	 * issue these as blocking operations in most cases. To avoid the BIO
	 * execution potentially failing with BLK_STS_AGAIN, warn about
	 * REQ_NOWAIT being set and ignore that flag.
	 */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
		bio->bi_opf &= ~REQ_NOWAIT;

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}

static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;

	/*
	 * Set the write pointer offset of all zone write plugs to 0. This will
	 * abort all plugged BIOs. It is fine as resetting zones while writes
	 * are still in-flight will result in the writes failing anyway.
	 */
	for (sector = 0; sector < get_capacity(disk);
	     sector += disk->queue->limits.chunk_sectors) {
		zwplug = disk_get_zone_wplug(disk, sector);
		if (zwplug) {
			spin_lock_irqsave(&zwplug->lock, flags);
			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
			spin_unlock_irqrestore(&zwplug->lock, flags);
			disk_put_zone_wplug(zwplug);
		}
	}

	return false;
}

static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	refcount_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
					   struct blk_zone_wplug *zwplug,
					   struct bio *bio, unsigned int nr_segs)
{
	bool schedule_bio_work = false;

	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * REQ_NOWAIT BIOs are always handled using the zone write plug BIO
	 * work, which can block. So clear the REQ_NOWAIT flag and schedule the
	 * work if this is the first BIO we are plugging.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
		bio->bi_opf &= ~REQ_NOWAIT;
	}

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	if (schedule_bio_work)
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug
	 * for the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and
	 * advance the zone write pointer offset. Given that this is a merge,
	 * we already have at least one request and one BIO referencing the
	 * zone write plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
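
/*
 * Note on emulated zone append (illustrative example): a zone append BIO
 * targets the start sector of its zone. For a zone starting at sector 524288
 * with a current wp_offset of 128, blk_zone_wplug_prepare_bio() below turns
 * such a BIO into a regular write at sector 524288 + 128 = 524416, and then
 * advances wp_offset by bio_sectors(bio).
 */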

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	lockdep_assert_held(&zwplug->lock);

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or
	 * finish the zone to recover a reliable write pointer position. Fail
	 * BIOs if the user did not do that as we cannot handle emulated zone
	 * append otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the
	 * correct zone write plug for the entire BIO. For blk-mq devices, the
	 * block layer should already have done any splitting required to
	 * ensure this and this BIO should thus not be straddling zone
	 * boundaries. For BIO-based devices, it is the responsibility of the
	 * driver to split the BIO before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged, add the BIO to the plug BIO list.
	 * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the BIO execute.
	 * Otherwise, plug and let the BIO execute.
	 */
	if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) ||
	    (bio->bi_opf & REQ_NOWAIT))
		goto plug;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have
	 * a zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk
	 * leaving the plug in the disk hash table if the zone is fully written
	 * using zone append operations. Avoid this by removing the zone write
	 * plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed. Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there are no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_remove_zone_wplug(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev->bd_disk->zone_wplugs_hash)
		return false;

	/*
	 * If the BIO already has the plugging flag set, then it was already
	 * handled through this path and this is a submission from the zone
	 * plug bio submit work.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return false;

	/*
	 * We do not need to do anything special for empty flush BIOs, e.g.
	 * BIOs such as issued by blkdev_issue_flush(). This is because it is
	 * the responsibility of the user to first wait for the completion of
	 * write operations for flush to have any effect on the persistence of
	 * the written data.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the
	 * target zone write plug. This includes writes with REQ_FUA |
	 * REQ_PREFLUSH which may need to go through the flush machinery
	 * depending on the target device capabilities. Plugging such writes is
	 * fine as the flush machinery operates at the request level, below the
	 * plug, and completion of the flush sequence will go through the
	 * regular BIO completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev)) {
			blk_zone_wplug_handle_native_zone_append(bio);
			return false;
		}
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
	case REQ_OP_ZONE_FINISH:
		return blk_zone_wplug_handle_reset_or_finish(bio,
						bdev_zone_sectors(bdev));
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_reset_all(bio);
	default:
		return false;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
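
/*
 * Illustrative use of blk_zone_plug_bio() (a sketch, not taken from a
 * specific driver): a BIO-based driver relying on zone write plugging would
 * call it from its submit_bio path and stop processing the BIO when it was
 * plugged, i.e. when the function returns true:
 *
 *	if (blk_zone_plug_bio(bio, 0))
 *		return;
 *	... remap and submit @bio ...
 */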

static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

again:
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that
 * is, 9 bits. For a disk that has no limits, the mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
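
/*
 * Example (illustrative): with the default mempool size of 128 zone write
 * plugs, ilog2(128) + 1 = 8 bits are used, giving a hash table of 256 hlist
 * head entries (2KB with 8-byte pointers), below the 9-bit / 4KB cap above.
 */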

static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	atomic_set(&disk->nr_zone_wplugs, 0);
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}

static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			refcount_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}

static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
					       unsigned long *bitmap)
{
	unsigned int nr_conv_zones = 0;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (bitmap)
		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
				     lockdep_is_held(&disk->zone_wplugs_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	kfree_rcu_mightsleep(bitmap);

	return nr_conv_zones;
}

void disk_free_zone_resources(struct gendisk *disk)
{
	if (!disk->zone_wplugs_pool)
		return;

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	disk_set_conv_zones_bitmap(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All mq zoned devices need zone resources so that the block layer
	 * can automatically handle write BIO plugging. BIO-based device
	 * drivers (e.g. DM devices) are normally responsible for handling
	 * zone write ordering and do not need zone resources, unless the
	 * driver requires zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

static int disk_revalidate_zone_resources(struct gendisk *disk,
					  unsigned int nr_zones)
{
	struct queue_limits *lim = &disk->queue->limits;
	unsigned int pool_size;

	if (!disk_need_zone_resources(disk))
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
	 */
	pool_size = max(lim->max_open_zones, lim->max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);

	if (!disk->zone_wplugs_hash)
		return disk_alloc_zone_resources(disk, pool_size);

	return 0;
}

struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned int	nr_zones;
	unsigned int	zone_capacity;
	unsigned int	last_zone_capacity;
	sector_t	sector;
};

/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones;
	unsigned int pool_size;
	struct queue_limits lim;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	nr_conv_zones =
		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	lim = queue_limits_start_update(q);

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. For such case, assume that the zoned device has
	 * no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	return queue_limits_commit_update_frozen(q, &lim);
}

static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;

	if (zone->capacity != zone->len) {
		pr_warn("%s: Invalid conventional zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (disk_zone_is_last(disk, zone))
		args->last_zone_capacity = zone->capacity;

	if (!disk_need_zone_resources(disk))
		return 0;

	if (!args->conv_zones_bitmap) {
		args->conv_zones_bitmap =
			bitmap_zalloc(args->nr_zones, GFP_NOIO);
		if (!args->conv_zones_bitmap)
			return -ENOMEM;
	}

	set_bit(idx, args->conv_zones_bitmap);

	return 0;
}

static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * If the device needs zone append emulation, we need to track the
	 * write pointer of all zones that are neither empty nor full. So make
	 * sure we have a zone write plug for such zones if the device has a
	 * zone write plug hash table.
	 */
	if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
		return 0;

	disk_zone_wplug_sync_wp_offset(disk, zone);

	wp_offset = blk_zone_wp_offset(zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}

/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception of an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	if (!ret)
		args->sector += zone->len;

	return ret;
}

/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk: Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a
 * format). Before calling this function, the device driver must already have
 * set the device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int noio_flag;
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	args.disk = disk;
	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}

	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	/*
	 * Set the new disk zone parameters only once the queue is frozen and
	 * all I/Os are completed.
	 */
	if (ret > 0)
		ret = disk_update_zone_resources(disk, &args);
	else
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
	if (ret) {
		unsigned int memflags = blk_mq_freeze_queue(q);

		disk_free_zone_resources(disk);
		blk_mq_unfreeze_queue(q, memflags);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
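
/*
 * Illustrative call site for blk_revalidate_disk_zones() (a sketch, not taken
 * from a specific driver): a zoned block device driver typically revalidates
 * zones once the zone size (chunk_sectors) and the zone append limit are set
 * and the gendisk is ready:
 *
 *	ret = blk_revalidate_disk_zones(disk);
 *	if (ret)
 *		return ret;
 */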

/**
 * blk_zone_issue_zeroout - zero-fill a block range in a zone
 * @bdev: blockdev to write
 * @sector: start sector
 * @nr_sects: number of sectors to write
 * @gfp_mask: memory allocation flags (for bio_alloc)
 *
 * Description:
 * Zero-fill a block range in a zone (@sector must be equal to the zone write
 * pointer), handling potential errors due to the (initially unknown) lack of
 * hardware offload (See blkdev_issue_zeroout()).
 */
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
			   sector_t nr_sects, gfp_t gfp_mask)
{
	int ret;

	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
		return -EIO;

	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
				   BLKDEV_ZERO_NOFALLBACK);
	if (ret != -EOPNOTSUPP)
		return ret;

	/*
	 * The failed call to blkdev_issue_zeroout() advanced the zone write
	 * pointer. Undo this using a report zone to update the zone write
	 * pointer to the correct current value.
	 */
	ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
	if (ret != 1)
		return ret < 0 ? ret : -EIO;

	/*
	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
	 * regular write with zero-pages.
	 */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
}
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);

#ifdef CONFIG_BLK_DEBUG_FS
static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
				  struct seq_file *m)
{
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size;
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwp_zone_no = zwplug->zone_no;
	zwp_flags = zwplug->flags;
	zwp_ref = refcount_read(&zwplug->ref);
	zwp_wp_offset = zwplug->wp_offset;
	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
		   zwp_wp_offset, zwp_bio_list_size);
}

int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
					 node)
			queue_zone_wplug_show(zwplug, m);
	rcu_read_unlock();

	return 0;
}

#endif