1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Zoned block device handling
4 *
5 * Copyright (c) 2015, Hannes Reinecke
6 * Copyright (c) 2015, SUSE Linux GmbH
7 *
8 * Copyright (c) 2016, Damien Le Moal
9 * Copyright (c) 2016, Western Digital
10 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
11 */
12
13 #include <linux/kernel.h>
14 #include <linux/blkdev.h>
15 #include <linux/blk-mq.h>
16 #include <linux/spinlock.h>
17 #include <linux/refcount.h>
18 #include <linux/mempool.h>
19 #include <linux/kthread.h>
20 #include <linux/freezer.h>
21
22 #include <trace/events/block.h>
23
24 #include "blk.h"
25 #include "blk-mq-sched.h"
26 #include "blk-mq-debugfs.h"
27
/*
 * Table mapping BLK_ZONE_COND_* values to their human-readable names.
 * Conditions without an entry in this table are left NULL and resolve to
 * "UNKNOWN" in blk_zone_cond_str().
 */
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
	ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME
41
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @entry: list_head structure for listing the plug in the disk list of active
 *	zone write plugs.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *	always at least 1 when the plug is hashed in the disk plug hash table.
 *	The reference is incremented whenever a new BIO needing plugging is
 *	submitted and when a function needs to manipulate a plug. The
 *	reference count is decremented whenever a plugged BIO completes and
 *	when a function that referenced the plug returns. The initial
 *	reference is dropped whenever the zone of the zone write plug is reset,
 *	finished and when the zone becomes full (last write BIO to the zone
 *	completes).
 * @flags: Flags indicating the plug state (BLK_ZONE_WPLUG_* bits).
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *	as a number of 512B sectors.
 * @cond: Condition of the zone (cached; folded back into the disk zones_cond
 *	array when the plug is freed).
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	entry;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
	spinlock_t		lock;
	refcount_t		ref;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	enum blk_zone_cond	cond;
};
81
disk_need_zone_resources(struct gendisk * disk)82 static inline bool disk_need_zone_resources(struct gendisk *disk)
83 {
84 /*
85 * All request-based zoned devices need zone resources so that the
86 * block layer can automatically handle write BIO plugging. BIO-based
87 * device drivers (e.g. DM devices) are normally responsible for
88 * handling zone write ordering and do not need zone resources, unless
89 * the driver requires zone append emulation.
90 */
91 return queue_is_mq(disk->queue) ||
92 queue_emulates_zone_append(disk->queue);
93 }
94
/* Number of buckets of the disk zone write plug hash table. */
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}
99
/*
 * Zone write plug flags bits:
 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *   that is, that write BIOs are being throttled due to a write BIO already
 *   being executed or the zone write plug bio list is not empty.
 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *   write pointer offset and need to update it.
 * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be
 *   removed from the disk hash table of zone write plugs when the last
 *   reference on the zone write plug is dropped. If set, this flag also
 *   indicates that the initial extra reference on the zone write plug was
 *   dropped, meaning that the reference count indicates the current number of
 *   active users (code context or BIOs and requests in flight). This flag is
 *   set when a zone is reset, finished or becomes full.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_DEAD		(1U << 2)
118
119 /**
120 * blk_zone_cond_str - Return a zone condition name string
121 * @zone_cond: a zone condition BLK_ZONE_COND_name
122 *
123 * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
124 * for the debugging and tracing zone conditions. For an invalid zone
125 * conditions, the string "UNKNOWN" is returned.
126 */
blk_zone_cond_str(enum blk_zone_cond zone_cond)127 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
128 {
129 static const char *zone_cond_str = "UNKNOWN";
130
131 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
132 zone_cond_str = zone_cond_name[zone_cond];
133
134 return zone_cond_str;
135 }
136 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
137
/*
 * Store the condition of zone @zno in the cached condition array @zones_cond,
 * folding the three "active" conditions (implicitly open, explicitly open and
 * closed) into BLK_ZONE_COND_ACTIVE. Any other condition is stored as-is.
 * Does nothing if @zones_cond is NULL.
 */
static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
			      enum blk_zone_cond cond)
{
	if (!zones_cond)
		return;

	if (cond == BLK_ZONE_COND_IMP_OPEN ||
	    cond == BLK_ZONE_COND_EXP_OPEN ||
	    cond == BLK_ZONE_COND_CLOSED)
		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
	else
		zones_cond[zno] = cond;
}
160
/*
 * Update the cached condition of the zone containing @sector to @cond.
 * The zones_cond array is accessed under RCU as it may be swapped/freed by
 * revalidation. No-op if the disk has no cached condition array.
 */
static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
			       enum blk_zone_cond cond)
{
	u8 *zones_cond;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond) {
		unsigned int zno = disk_zone_no(disk, sector);

		/*
		 * The condition of conventional, read-only and offline zones
		 * never changes, so do nothing if the target zone is in one of
		 * these conditions.
		 */
		switch (zones_cond[zno]) {
		case BLK_ZONE_COND_NOT_WP:
		case BLK_ZONE_COND_READONLY:
		case BLK_ZONE_COND_OFFLINE:
			break;
		default:
			blk_zone_set_cond(zones_cond, zno, cond);
			break;
		}
	}
	rcu_read_unlock();
}
188
189 /**
190 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
191 * @bdev: block device to check
192 * @sector: sector number
193 *
194 * Check if @sector on @bdev is contained in a sequential write required zone.
195 */
bdev_zone_is_seq(struct block_device * bdev,sector_t sector)196 bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
197 {
198 struct gendisk *disk = bdev->bd_disk;
199 unsigned int zno = disk_zone_no(disk, sector);
200 bool is_seq = false;
201 u8 *zones_cond;
202
203 if (!bdev_is_zoned(bdev))
204 return false;
205
206 rcu_read_lock();
207 zones_cond = rcu_dereference(disk->zones_cond);
208 if (zones_cond && zno < disk->nr_zones)
209 is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
210 rcu_read_unlock();
211
212 return is_seq;
213 }
214 EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
215
/*
 * Zone report arguments for block device drivers report_zones operation.
 * @cb: report_zones_cb callback for each reported zone.
 * @data: Private data passed to report_zones_cb.
 * @report_active: If true, the implicit open, explicit open and closed zone
 *	conditions are folded into BLK_ZONE_COND_ACTIVE when a zone is
 *	reported (used for regular report zones executed as a fallback for a
 *	cached zone report).
 */
struct blk_report_zones_args {
	report_zones_cb cb;
	void *data;
	bool report_active;
};
226
/*
 * Execute a zone report using the device driver report_zones method.
 * Returns the number of zones reported (0 when @nr_zones is 0 or @sector is
 * past the disk capacity) or a negative error code.
 */
static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
				  unsigned int nr_zones,
				  struct blk_report_zones_args *args)
{
	struct gendisk *disk = bdev->bd_disk;

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= get_capacity(disk))
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, args);
}
241
/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	/* report_active is left false: report raw device zone conditions. */
	struct blk_report_zones_args args = {
		.cb = cb,
		.data = data,
	};

	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
272
/*
 * Reset all zones of a device with a single REQ_OP_ZONE_RESET_ALL operation.
 * Uses an on-stack BIO, which is safe as submit_bio_wait() returns only
 * once the BIO has completed.
 */
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	trace_blkdev_zone_mgmt(&bio, 0);
	return submit_bio_wait(&bio);
}
281
/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone and
 *		must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request. Returns 0 on success and a negative errno value on
 *    failure.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	/* An unaligned range end is only allowed at the disk capacity. */
	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	/* Chain one zone management BIO per zone and wait for the last one. */
	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	trace_blkdev_zone_mgmt(bio, nr_sectors);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
349
/* Callback data for the BLKREPORTZONE* ioctls: user buffer to fill. */
struct zone_report_args {
	struct blk_zone __user *zones;
};
353
blkdev_copy_zone_to_user(struct blk_zone * zone,unsigned int idx,void * data)354 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
355 void *data)
356 {
357 struct zone_report_args *args = data;
358
359 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
360 return -EFAULT;
361 return 0;
362 }
363
/*
 * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED
368
/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	/* The zone descriptors follow the report header in user memory. */
	args.zones = argp + sizeof(struct blk_zone_report);

	switch (cmd) {
	case BLKREPORTZONE:
		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
					  blkdev_copy_zone_to_user, &args);
		break;
	case BLKREPORTZONEV2:
		/* Reject flags this kernel does not know about. */
		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
			return -EINVAL;
		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
						 blkdev_copy_zone_to_user, &args);
		break;
	default:
		return -EINVAL;
	}

	if (ret < 0)
		return ret;

	/* Return the number of reported zones and output flags to the user. */
	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
419
/*
 * BLKRESETZONE ioctl helper: invalidate the page cache over the target range
 * and reset the zones. The inode lock and the invalidate lock are held across
 * the whole operation to prevent racing page cache users from re-instantiating
 * stale pages while zones are being reset.
 */
static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
			     struct blk_zone_range *zrange)
{
	loff_t start, end;
	int ret = -EINVAL;

	inode_lock(bdev->bd_mapping->host);
	filemap_invalidate_lock(bdev->bd_mapping);
	/* The first test also catches sector + nr_sectors overflow. */
	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		goto out_unlock;

	/* Invalidate the page cache, including dirty pages. */
	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	ret = truncate_bdev_range(bdev, mode, start, end);
	if (ret)
		goto out_unlock;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
			       zrange->nr_sectors);
out_unlock:
	filemap_invalidate_unlock(bdev->bd_mapping);
	inode_unlock(bdev->bd_mapping->host);
	return ret;
}
447
/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	/* All of these operations modify the device: require write access. */
	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		/* Reset also needs page cache invalidation, handled there. */
		return blkdev_reset_zone(bdev, mode, &zrange);
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}
489
disk_zone_is_last(struct gendisk * disk,struct blk_zone * zone)490 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
491 {
492 return zone->start + zone->len >= get_capacity(disk);
493 }
494
disk_zone_wplug_is_full(struct gendisk * disk,struct blk_zone_wplug * zwplug)495 static bool disk_zone_wplug_is_full(struct gendisk *disk,
496 struct blk_zone_wplug *zwplug)
497 {
498 if (zwplug->zone_no < disk->nr_zones - 1)
499 return zwplug->wp_offset >= disk->zone_capacity;
500 return zwplug->wp_offset >= disk->last_zone_capacity;
501 }
502
/*
 * Insert a new zone write plug in the disk hash table. Returns false without
 * inserting if another submission context raced us and already hashed a plug
 * for the same zone.
 */
static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	u8 *zones_cond;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
					       flags);
			return false;
		}
	}

	/*
	 * Set the zone condition: if we do not yet have a zones_cond array
	 * attached to the disk, then this is a zone write plug insert from the
	 * first call to blk_revalidate_disk_zones(), in which case the zone is
	 * necessarily in the active condition.
	 */
	zones_cond = rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock));
	if (zones_cond)
		zwplug->cond = zones_cond[zwplug->zone_no];
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;

	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	return true;
}
545
/*
 * Look up the zone write plug for the zone containing @sector in the disk
 * hash table and take a reference on it. Returns NULL if no plug exists or
 * if the plug found is being freed (its reference count dropped to 0).
 * RCU protects the hash table traversal against concurrent removal.
 */
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}
567
/*
 * Get a reference on the zone write plug of the zone containing @sector,
 * or NULL if the zone has no write plug. The nr_zone_wplugs counter is
 * checked first to avoid the hash lookup when no plug exists at all.
 */
static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	if (atomic_read(&disk->nr_zone_wplugs))
		return disk_get_hashed_zone_wplug(disk, sector);

	return NULL;
}
576
/*
 * RCU callback freeing a zone write plug back to the disk mempool after a
 * grace period, so that concurrent hash table walkers are done with it.
 */
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}
584
/*
 * Unhash a zone write plug whose last reference was dropped and schedule its
 * freeing after an RCU grace period. The plug condition is folded back into
 * the disk cached condition array before the plug is removed.
 */
static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	unsigned long flags;

	/* A freed plug must be dead, unplugged and have no plugged BIOs. */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock)),
			  zwplug->zone_no, zwplug->cond);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	/* Defer the actual freeing until RCU readers are done. */
	call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}
604
/* Drop a reference on a zone write plug, freeing it if it was the last one. */
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref))
		disk_free_zone_wplug(zwplug);
}
610
611 /*
612 * Flag the zone write plug as dead and drop the initial reference we got when
613 * the zone write plug was added to the hash table. The zone write plug will be
614 * unhashed when its last reference is dropped.
615 */
disk_mark_zone_wplug_dead(struct blk_zone_wplug * zwplug)616 static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
617 {
618 lockdep_assert_held(&zwplug->lock);
619
620 if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
621 zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
622 disk_put_zone_wplug(zwplug);
623 }
624 }
625
626 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
627 struct blk_zone_wplug *zwplug);
628
/*
 * Per-plug work function issuing the plugged BIOs of a zone write plug.
 */
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);

	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);

	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
	disk_put_zone_wplug(zwplug);
}
639
/*
 * Get a zone write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and inserted in the disk hash
 * table. Returns NULL only if the mempool allocation fails. The returned
 * plug holds a reference taken on behalf of the caller.
 */
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug)
		return zwplug;

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	/* ref == 2: the hash table reference plus the caller's reference. */
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	INIT_LIST_HEAD(&zwplug->entry);
	zwplug->disk = disk;

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}
688
/*
 * Fail a plugged BIO with an I/O error, dropping the plug reference and the
 * queue usage reference that were taken when the BIO was plugged.
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}
700
/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	struct bio *bio;

	lockdep_assert_held(&zwplug->lock);

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);

	/* The BIO list is now empty: the plug is no longer plugged. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If we are using the per disk zone write plugs worker thread, remove
	 * the zone write plug from the work list and drop the reference we
	 * took when the zone write plug was added to that list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (!list_empty(&zwplug->entry)) {
			list_del_init(&zwplug->entry);
			disk_put_zone_wplug(zwplug);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}
735
736 /*
737 * Update a zone write plug condition based on the write pointer offset.
738 */
disk_zone_wplug_update_cond(struct gendisk * disk,struct blk_zone_wplug * zwplug)739 static void disk_zone_wplug_update_cond(struct gendisk *disk,
740 struct blk_zone_wplug *zwplug)
741 {
742 lockdep_assert_held(&zwplug->lock);
743
744 if (disk_zone_wplug_is_full(disk, zwplug))
745 zwplug->cond = BLK_ZONE_COND_FULL;
746 else if (!zwplug->wp_offset)
747 zwplug->cond = BLK_ZONE_COND_EMPTY;
748 else
749 zwplug->cond = BLK_ZONE_COND_ACTIVE;
750 }
751
/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);
	/* An empty or full zone no longer needs a write plug. */
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);
}
773
/*
 * Return the write pointer offset of @zone relative to the zone start, in
 * 512B sectors, or UINT_MAX for zones without a valid write pointer.
 */
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_ACTIVE:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, full, offline and read-only zones do not have
		 * a valid write pointer.
		 */
		return UINT_MAX;
	}
}
796
/*
 * Resynchronize a zone write plug write pointer offset from the device
 * reported zone information @zone, if the plug was flagged as needing an
 * update (e.g. after a write error). Returns the write pointer offset
 * derived from @zone.
 */
static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
						   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset = blk_zone_wp_offset(zone);

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return wp_offset;
}
816
/**
 * disk_report_zone - Report one zone
 * @disk: Target disk
 * @zone: The zone to report
 * @idx: The index of the zone in the overall zone report
 * @args: report zones callback and data
 *
 * Description:
 *    Helper function for block device drivers to report one zone of a zone
 *    report initiated with blkdev_report_zones(). The zone being reported is
 *    specified by @zone and used to update, if necessary, the zone write plug
 *    information for the zone. If @args specifies a user callback function,
 *    this callback is executed.
 */
int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
		     unsigned int idx, struct blk_report_zones_args *args)
{
	if (args && args->report_active) {
		/*
		 * If we come here, then this is a report zones as a fallback
		 * for a cached report. So collapse the implicit open, explicit
		 * open and closed conditions into the active zone condition.
		 */
		switch (zone->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			zone->cond = BLK_ZONE_COND_ACTIVE;
			break;
		default:
			break;
		}
	}

	/* Opportunistically refresh the zone write plug wp if it needs it. */
	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (args && args->cb)
		return args->cb(zone, idx, args->data);

	return 0;
}
EXPORT_SYMBOL_GPL(disk_report_zone);
860
blkdev_report_zone_cb(struct blk_zone * zone,unsigned int idx,void * data)861 static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
862 void *data)
863 {
864 memcpy(data, zone, sizeof(struct blk_zone));
865 return 0;
866 }
867
/*
 * Get the information of the zone containing @sector with a regular (device)
 * zone report, folding the open and closed conditions into the active
 * condition to mimic a cached report. Returns 0 on success, -EIO if the
 * device reported no zone, or a negative error code.
 */
static int blkdev_report_zone_fallback(struct block_device *bdev,
				       sector_t sector, struct blk_zone *zone)
{
	struct blk_report_zones_args args = {
		.cb = blkdev_report_zone_cb,
		.data = zone,
		.report_active = true,
	};
	int nr_zones = blkdev_do_report_zones(bdev, sector, 1, &args);

	if (nr_zones < 0)
		return nr_zones;

	return nr_zones ? 0 : -EIO;
}
885
/*
 * For devices that natively support zone append operations, we do not use zone
 * write plugging for zone append writes, which makes the zone condition
 * tracking invalid once zone append was used. In that case fall back to a
 * regular report zones to get correct information.
 */
static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
{
	return disk_need_zone_resources(bdev->bd_disk) &&
		(bdev_emulates_zone_append(bdev) ||
		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
}
898
/**
 * blkdev_get_zone_info - Get a single zone information from cached data
 * @bdev: Target block device
 * @sector: Sector contained by the target zone
 * @zone: zone structure to return the zone information
 *
 * Description:
 *    Get the zone information for the zone containing @sector using the zone
 *    write plug of the target zone, if one exist, or the disk zone condition
 *    array otherwise. The zone condition may be reported as being
 *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 *    open, explicit open or closed condition.
 *
 *    Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
			 struct blk_zone *zone)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (sector >= get_capacity(disk))
		return -EINVAL;

	memset(zone, 0, sizeof(*zone));
	sector = bdev_zone_start(bdev, sector);

	/* Cached information may be stale: fall back to a device report. */
	if (!blkdev_has_cached_report_zones(bdev))
		return blkdev_report_zone_fallback(bdev, sector, zone);

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (!disk->zone_wplugs_hash || !zones_cond) {
		rcu_read_unlock();
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zones_cond[disk_zone_no(disk, sector)];
	rcu_read_unlock();

	zone->start = sector;
	zone->len = zone_sectors;

	/*
	 * If this is a conventional zone, we do not have a zone write plug and
	 * can report the zone immediately.
	 */
	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->capacity = zone_sectors;
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * This is a sequential write required zone. If the zone is read-only or
	 * offline, only set the zone write pointer to an invalid value and
	 * report the zone.
	 */
	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
	if (disk_zone_is_last(disk, zone))
		zone->capacity = disk->last_zone_capacity;
	else
		zone->capacity = disk->zone_capacity;

	if (zone->cond == BLK_ZONE_COND_READONLY ||
	    zone->cond == BLK_ZONE_COND_OFFLINE) {
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * If the zone does not have a zone write plug, it is either full or
	 * empty, as we otherwise would have a zone write plug for it. In this
	 * case, set the write pointer accordingly and report the zone.
	 * Otherwise, if we have a zone write plug, use it.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (!zwplug) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = ULLONG_MAX;
		else
			zone->wp = sector;
		return 0;
	}

	spin_lock_irqsave(&zwplug->lock, flags);
	/* A plug needing a wp update has stale data: use a device report. */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zwplug->cond;
	zone->wp = sector + zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
1004 EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
1005
1006 /**
1007 * blkdev_report_zones_cached - Get cached zones information
1008 * @bdev: Target block device
1009 * @sector: Sector from which to report zones
1010 * @nr_zones: Maximum number of zones to report
1011 * @cb: Callback function called for each reported zone
1012 * @data: Private data for the callback function
1013 *
1014 * Description:
1015 * Similar to blkdev_report_zones() but instead of calling into the low level
1016 * device driver to get the zone report from the device, use
1017 * blkdev_get_zone_info() to generate the report from the disk zone write
1018 * plugs and zones condition array. Since calling this function without a
1019 * callback does not make sense, @cb must be specified.
1020 */
blkdev_report_zones_cached(struct block_device * bdev,sector_t sector,unsigned int nr_zones,report_zones_cb cb,void * data)1021 int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
1022 unsigned int nr_zones, report_zones_cb cb, void *data)
1023 {
1024 struct gendisk *disk = bdev->bd_disk;
1025 sector_t capacity = get_capacity(disk);
1026 sector_t zone_sectors = bdev_zone_sectors(bdev);
1027 unsigned int idx = 0;
1028 struct blk_zone zone;
1029 int ret;
1030
1031 if (!cb || !bdev_is_zoned(bdev) ||
1032 WARN_ON_ONCE(!disk->fops->report_zones))
1033 return -EOPNOTSUPP;
1034
1035 if (!nr_zones || sector >= capacity)
1036 return 0;
1037
1038 if (!blkdev_has_cached_report_zones(bdev)) {
1039 struct blk_report_zones_args args = {
1040 .cb = cb,
1041 .data = data,
1042 .report_active = true,
1043 };
1044
1045 return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
1046 }
1047
1048 for (sector = bdev_zone_start(bdev, sector);
1049 sector < capacity && idx < nr_zones;
1050 sector += zone_sectors, idx++) {
1051 ret = blkdev_get_zone_info(bdev, sector, &zone);
1052 if (ret)
1053 return ret;
1054
1055 ret = cb(&zone, idx, data);
1056 if (ret)
1057 return ret;
1058 }
1059
1060 return idx;
1061 }
1062 EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
1063
blk_zone_reset_bio_endio(struct bio * bio)1064 static void blk_zone_reset_bio_endio(struct bio *bio)
1065 {
1066 struct gendisk *disk = bio->bi_bdev->bd_disk;
1067 sector_t sector = bio->bi_iter.bi_sector;
1068 struct blk_zone_wplug *zwplug;
1069
1070 /*
1071 * If we have a zone write plug, set its write pointer offset to 0.
1072 * This will abort all BIOs plugged for the target zone. It is fine as
1073 * resetting zones while writes are still in-flight will result in the
1074 * writes failing anyway.
1075 */
1076 zwplug = disk_get_zone_wplug(disk, sector);
1077 if (zwplug) {
1078 unsigned long flags;
1079
1080 spin_lock_irqsave(&zwplug->lock, flags);
1081 disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1082 spin_unlock_irqrestore(&zwplug->lock, flags);
1083 disk_put_zone_wplug(zwplug);
1084 } else {
1085 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1086 }
1087 }
1088
/*
 * Completion handling for a successful REQ_OP_ZONE_RESET_ALL BIO: mark every
 * zone of the disk as empty.
 */
static void blk_zone_reset_all_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;
	unsigned int i;

	if (atomic_read(&disk->nr_zone_wplugs)) {
		/*
		 * Update the condition of all zone write plugs. Setting the
		 * write pointer offset to 0 also aborts any plugged BIOs,
		 * which is fine as those writes would fail after the reset
		 * anyway.
		 */
		rcu_read_lock();
		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
			hlist_for_each_entry_rcu(zwplug,
						 &disk->zone_wplugs_hash[i],
						 node) {
				spin_lock_irqsave(&zwplug->lock, flags);
				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
				spin_unlock_irqrestore(&zwplug->lock, flags);
			}
		}
		rcu_read_unlock();
	}

	/* Update the cached zone conditions. */
	for (sector = 0; sector < capacity;
	     sector += bdev_zone_sectors(bio->bi_bdev))
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);

	/*
	 * All zones are now empty, so the cached zone information becomes
	 * reliable again even if zone append was previously used.
	 */
	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
}
1119
blk_zone_finish_bio_endio(struct bio * bio)1120 static void blk_zone_finish_bio_endio(struct bio *bio)
1121 {
1122 struct block_device *bdev = bio->bi_bdev;
1123 struct gendisk *disk = bdev->bd_disk;
1124 sector_t sector = bio->bi_iter.bi_sector;
1125 struct blk_zone_wplug *zwplug;
1126
1127 /*
1128 * If we have a zone write plug, set its write pointer offset to the
1129 * zone size. This will abort all BIOs plugged for the target zone. It
1130 * is fine as resetting zones while writes are still in-flight will
1131 * result in the writes failing anyway.
1132 */
1133 zwplug = disk_get_zone_wplug(disk, sector);
1134 if (zwplug) {
1135 unsigned long flags;
1136
1137 spin_lock_irqsave(&zwplug->lock, flags);
1138 disk_zone_wplug_set_wp_offset(disk, zwplug,
1139 bdev_zone_sectors(bdev));
1140 spin_unlock_irqrestore(&zwplug->lock, flags);
1141 disk_put_zone_wplug(zwplug);
1142 } else {
1143 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
1144 }
1145 }
1146
blk_zone_mgmt_bio_endio(struct bio * bio)1147 void blk_zone_mgmt_bio_endio(struct bio *bio)
1148 {
1149 /* If the BIO failed, we have nothing to do. */
1150 if (bio->bi_status != BLK_STS_OK)
1151 return;
1152
1153 switch (bio_op(bio)) {
1154 case REQ_OP_ZONE_RESET:
1155 blk_zone_reset_bio_endio(bio);
1156 return;
1157 case REQ_OP_ZONE_RESET_ALL:
1158 blk_zone_reset_all_bio_endio(bio);
1159 return;
1160 case REQ_OP_ZONE_FINISH:
1161 blk_zone_finish_bio_endio(bio);
1162 return;
1163 default:
1164 return;
1165 }
1166 }
1167
/*
 * Schedule the per-plug BIO work of @zwplug to submit the next plugged BIO.
 * Must be called with @zwplug->lock held.
 */
static void disk_zone_wplug_schedule_work(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/*
	 * Schedule the submission of the next plugged BIO. Taking a reference
	 * to the zone write plug is required as the bio_work belongs to the
	 * plug, and thus we must ensure that the write plug does not go away
	 * while the work is being scheduled but has not run yet.
	 * blk_zone_wplug_bio_work() will release the reference we take here,
	 * and we also drop this reference if the work is already scheduled.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	/* QD=1 (worker based) submission must never use the per-plug work. */
	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
	refcount_inc(&zwplug->ref);
	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
		disk_put_zone_wplug(zwplug);
}
1187
/*
 * Add @bio to the plugged BIO list of @zwplug. Called with @zwplug->lock
 * held (see blk_zone_wplug_handle_write()).
 */
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
					   struct blk_zone_wplug *zwplug,
					   struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue write sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
				      bio->bi_iter.bi_sector, bio_sectors(bio));

	/*
	 * If we are using the disk zone write plugs worker instead of the per
	 * zone write plug BIO work, add the zone write plug to the work list
	 * if it is not already there. Make sure to also get an extra reference
	 * on the zone write plug so that it does not go away until it is
	 * removed from the work list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (list_empty(&zwplug->entry)) {
			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
			refcount_inc(&zwplug->ref);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}
1239
/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 * Accounts the merged BIO sectors in the zone write pointer offset of the
 * target zone write plug.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
1276
/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	/* Nothing more to do if merging is disabled for the queue. */
	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		/* Only contiguous, mergeable BIOs can be appended. */
		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			/* Merge failed: put the BIO back and stop trying. */
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);
		disk_zone_wplug_update_cond(disk, zwplug);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}
1338
/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 * Returns false if the BIO cannot be issued and must be failed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	lockdep_assert_held(&zwplug->lock);

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or
	 * finish the zone to recover a reliable write pointer position. Fail
	 * BIOs if the user did not do that as we cannot handle emulated zone
	 * append otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);

	return true;
}
1400
/*
 * Handle a write, write zeroes or emulated zone append BIO with zone write
 * plugging. Returns true if the BIO was consumed (plugged or terminated with
 * an error) and false if the caller should submit it normally.
 */
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	/* Do not sleep on allocations when the issuer asked not to block. */
	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * If we got a zone write plug marked as dead, then the user is issuing
	 * writes to a full zone, or without synchronizing with zone reset or
	 * zone finish operations. In such case, fail the BIO to signal this
	 * invalid usage.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_opf &= ~REQ_NOWAIT;
		goto queue_bio;
	}

	/*
	 * For rotational devices, we will use the gendisk zone write plugs
	 * work instead of the per zone write plug BIO work, so queue the BIO.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		goto queue_bio;

	/* If the zone is already plugged, add the BIO to the BIO plug list. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		goto queue_bio;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	/* Otherwise, plug and let the caller submit the BIO. */
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

queue_bio:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	/*
	 * If the zone was not yet plugged, mark it plugged and kick off BIO
	 * submission: wake up the disk worker for QD=1 devices, or schedule
	 * the per-plug BIO work otherwise.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
		if (blk_queue_zoned_qd1_writes(disk->queue))
			wake_up_process(disk->zone_wplugs_worker);
		else
			disk_zone_wplug_schedule_work(disk, zwplug);
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}
1510
/*
 * Handle a native (non-emulated) zone append BIO: record that zone append
 * was used and remove any stale zone write plug for the target zone.
 */
static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * Record that the cached zone information may become stale. The
	 * test_bit() check avoids an atomic RMW when the bit is already set.
	 */
	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
		set_bit(GD_ZONE_APPEND_USED, &disk->state);

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have a
	 * zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk leaving
	 * the plug in the disk hash table if the zone is fully written using
	 * zone append operations. Avoid this by removing the zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed. Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there is no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_mark_zone_wplug_dead(zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}
1554
blk_zone_wplug_handle_zone_mgmt(struct bio * bio)1555 static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
1556 {
1557 if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
1558 !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
1559 /*
1560 * Zone reset and zone finish operations do not apply to
1561 * conventional zones.
1562 */
1563 bio_io_error(bio);
1564 return true;
1565 }
1566
1567 /*
1568 * No-wait zone management BIOs do not make much sense as the callers
1569 * issue these as blocking operations in most cases. To avoid issues
1570 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
1571 * about REQ_NOWAIT being set and ignore that flag.
1572 */
1573 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
1574 bio->bi_opf &= ~REQ_NOWAIT;
1575
1576 return false;
1577 }
1578
1579 /**
1580 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1581 * @bio: The BIO being submitted
1582 * @nr_segs: The number of physical segments of @bio
1583 *
1584 * Handle write, write zeroes and zone append operations requiring emulation
1585 * using zone write plugging.
1586 *
1587 * Return true whenever @bio execution needs to be delayed through the zone
1588 * write plug. Otherwise, return false to let the submission path process
1589 * @bio normally.
1590 */
blk_zone_plug_bio(struct bio * bio,unsigned int nr_segs)1591 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1592 {
1593 struct block_device *bdev = bio->bi_bdev;
1594
1595 if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
1596 return false;
1597
1598 /*
1599 * Regular writes and write zeroes need to be handled through the target
1600 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1601 * which may need to go through the flush machinery depending on the
1602 * target device capabilities. Plugging such writes is fine as the flush
1603 * machinery operates at the request level, below the plug, and
1604 * completion of the flush sequence will go through the regular BIO
1605 * completion, which will handle zone write plugging.
1606 * Zone append operations for devices that requested emulation must
1607 * also be plugged so that these BIOs can be changed into regular
1608 * write BIOs.
1609 * Zone reset, reset all and finish commands need special treatment
1610 * to correctly track the write pointer offset of zones. These commands
1611 * are not plugged as we do not need serialization with write
1612 * operations. It is the responsibility of the user to not issue reset
1613 * and finish commands when write operations are in flight.
1614 */
1615 switch (bio_op(bio)) {
1616 case REQ_OP_ZONE_APPEND:
1617 if (!bdev_emulates_zone_append(bdev)) {
1618 blk_zone_wplug_handle_native_zone_append(bio);
1619 return false;
1620 }
1621 fallthrough;
1622 case REQ_OP_WRITE:
1623 case REQ_OP_WRITE_ZEROES:
1624 return blk_zone_wplug_handle_write(bio, nr_segs);
1625 case REQ_OP_ZONE_RESET:
1626 case REQ_OP_ZONE_FINISH:
1627 case REQ_OP_ZONE_RESET_ALL:
1628 return blk_zone_wplug_handle_zone_mgmt(bio);
1629 default:
1630 return false;
1631 }
1632
1633 return false;
1634 }
1635 EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
1636
/*
 * Handle completion of a plugged zone write BIO or request: clear the
 * plugged state when the BIO list is drained and hand off submission of the
 * next plugged BIO (or signal the QD=1 worker).
 */
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * For rotational devices, signal the BIO completion to the zone write
	 * plug work. Otherwise, schedule submission of the next plugged BIO
	 * if we have one.
	 */
	if (bio_list_empty(&zwplug->bio_list))
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	if (blk_queue_zoned_qd1_writes(disk->queue))
		complete(&disk->zone_wplugs_worker_bio_done);
	else if (!bio_list_empty(&zwplug->bio_list))
		disk_zone_wplug_schedule_work(disk, zwplug);

	/*
	 * The plug is not needed anymore for an empty or full zone: mark it
	 * dead so it can be torn down (see disk_mark_zone_wplug_dead()).
	 */
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
1662
/*
 * Propagate the written location of a zone append request back to its BIO.
 */
void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
{
	/*
	 * For zone append requests, the request sector indicates the location
	 * at which the BIO data was written. Return this value to the BIO
	 * issuer through the BIO iter sector.
	 * For plugged zone writes, which include emulated zone append, we need
	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
	 * lookup the zone write plug.
	 */
	bio->bi_iter.bi_sector = rq->__sector;
	trace_blk_zone_append_update_request_bio(rq);
}
1676
/*
 * Completion handling for a zone write plugged BIO: restore emulated zone
 * append BIOs, handle write errors and release the plug references.
 */
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1725
/*
 * Completion handling for a zone write plugged request (blk-mq devices):
 * release the request reference on the plug and unplug the next BIO.
 */
void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request(). The plug cannot go away here as
	 * we still hold the reference taken on entry to this function.
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1748
/*
 * Submit the next plugged BIO of @zwplug. Returns true if a BIO was
 * submitted and false if the plug BIO list is empty (the plugged state is
 * then cleared).
 */
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;
	bool prepared;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
again:
	spin_lock_irqsave(&zwplug->lock, flags);
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return false;
	}

	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
				 bio->bi_iter.bi_sector, bio_sectors(bio));

	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	/* A BIO that cannot be prepared is failed; try the next one. */
	if (!prepared) {
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	/*
	 * For QD=1 (worker based) submission, the worker waits on
	 * zone_wplugs_worker_bio_done for the BIO to complete, so reset the
	 * completion before the BIO is issued.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		reinit_completion(&disk->zone_wplugs_worker_bio_done);
	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	bdev = bio->bi_bdev;
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
		bdev->bd_disk->fops->submit_bio(bio);
		blk_queue_exit(bdev->bd_disk->queue);
	} else {
		blk_mq_submit_bio(bio);
	}

	return true;
}
1799
disk_get_zone_wplugs_work(struct gendisk * disk)1800 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
1801 {
1802 struct blk_zone_wplug *zwplug;
1803
1804 spin_lock_irq(&disk->zone_wplugs_list_lock);
1805 zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
1806 struct blk_zone_wplug, entry);
1807 if (zwplug)
1808 list_del_init(&zwplug->entry);
1809 spin_unlock_irq(&disk->zone_wplugs_list_lock);
1810
1811 return zwplug;
1812 }
1813
/*
 * Disk zone write plugs worker thread, used for QD=1 write submission:
 * pulls zone write plugs from the disk active list and issues their plugged
 * BIOs one at a time, waiting for each BIO to complete before issuing the
 * next one.
 */
static int disk_zone_wplugs_worker(void *data)
{
	struct gendisk *disk = data;
	struct blk_zone_wplug *zwplug;
	unsigned int noio_flag;

	/* Avoid recursing into the block layer from memory reclaim. */
	noio_flag = memalloc_noio_save();
	set_user_nice(current, MIN_NICE);
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);

		zwplug = disk_get_zone_wplugs_work(disk);
		if (zwplug) {
			/*
			 * Process all BIOs of this zone write plug and then
			 * drop the reference we took when adding the zone write
			 * plug to the active list.
			 */
			set_current_state(TASK_RUNNING);
			while (disk_zone_wplug_submit_bio(disk, zwplug))
				blk_wait_io(&disk->zone_wplugs_worker_bio_done);
			disk_put_zone_wplug(zwplug);
			continue;
		}

		/*
		 * Only sleep if nothing sets the state to running. Else check
		 * for zone write plugs work again as a newly submitted BIO
		 * might have added a zone write plug to the work list.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
		} else {
			if (kthread_should_stop()) {
				set_current_state(TASK_RUNNING);
				break;
			}
			schedule();
		}
	}

	/* The active list must be fully drained when the worker stops. */
	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
	memalloc_noio_restore(noio_flag);

	return 0;
}
1862
disk_init_zone_resources(struct gendisk * disk)1863 void disk_init_zone_resources(struct gendisk *disk)
1864 {
1865 spin_lock_init(&disk->zone_wplugs_hash_lock);
1866 spin_lock_init(&disk->zone_wplugs_list_lock);
1867 INIT_LIST_HEAD(&disk->zone_wplugs_list);
1868 init_completion(&disk->zone_wplugs_worker_bio_done);
1869 }
1870
1871 /*
1872 * For the size of a disk zone write plug hash table, use the size of the
1873 * zone write plug mempool, which is the maximum of the disk open zones and
1874 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
1875 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1876 */
1877 #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
1878 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
1879
/*
 * Allocate the resources needed for zone write plugging of a disk: the zone
 * write plug hash table, the zone write plug mempool, the error recovery
 * workqueue and the plugged BIO submission worker thread.
 * @pool_size: number of zone write plugs to pre-allocate in the mempool
 *	       (maximum of the disk open and active zone limits, or
 *	       BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE if the disk has no limits).
 * Returns 0 on success or a negative error code, with any partially
 * allocated resource released.
 */
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;
	int ret = -ENOMEM;

	atomic_set(&disk->nr_zone_wplugs, 0);
	/* Hash size: next power of two above pool_size, capped at 2^9. */
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kzalloc_objs(struct hlist_head,
			     disk_zone_wplugs_hash_size(disk));
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	disk->zone_wplugs_worker =
		kthread_create(disk_zone_wplugs_worker, disk,
			       "%s_zwplugs_worker", disk->disk_name);
	if (IS_ERR(disk->zone_wplugs_worker)) {
		ret = PTR_ERR(disk->zone_wplugs_worker);
		/* Clear the ERR_PTR so cleanup paths can test against NULL. */
		disk->zone_wplugs_worker = NULL;
		goto destroy_wq;
	}
	wake_up_process(disk->zone_wplugs_worker);

	return 0;

destroy_wq:
	destroy_workqueue(disk->zone_wplugs_wq);
	disk->zone_wplugs_wq = NULL;
destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return ret;
}
1934
/*
 * Tear down the zone write plug hash table and mempool of a disk. Any zone
 * write plug still hashed is marked dead (under its lock) so it gets
 * unhashed and eventually freed; the mempool is destroyed only after an RCU
 * barrier so that RCU-freed plugs have returned to the pool first.
 */
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			spin_lock_irq(&zwplug->lock);
			disk_mark_zone_wplug_dead(zwplug);
			spin_unlock_irq(&zwplug->lock);
		}
	}

	/* All buckets drained: the hashed plug count must be back to zero. */
	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;

	/*
	 * Wait for the zone write plugs to be RCU-freed before destroying the
	 * mempool.
	 */
	rcu_barrier();
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
}
1967
/*
 * Install @zones_cond as the disk per-zone condition array (may be NULL to
 * clear it) and free the previous array after an RCU grace period, so that
 * lockless readers dereferencing disk->zones_cond stay safe.
 */
static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
{
	u8 *old_conds;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	old_conds = rcu_replace_pointer(disk->zones_cond, zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	kfree_rcu_mightsleep(old_conds);
}
1979
/*
 * Release all zone resources of a disk: stop the zone write plug worker
 * thread, destroy the workqueue, the zone write plug hash table and mempool,
 * and clear the cached zone condition array and zone geometry information.
 * Called on disk release and on zone revalidation failure.
 */
void disk_free_zone_resources(struct gendisk *disk)
{
	/* Stop the worker first so it no longer touches the plug list. */
	if (disk->zone_wplugs_worker)
		kthread_stop(disk->zone_wplugs_worker);
	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/* Drop the cached per-zone condition array (RCU-freed). */
	disk_set_zones_cond_array(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}
1998
/*
 * Zone revalidation state, passed as private data to blk_revalidate_zone_cb()
 * while iterating over the zone report in blk_revalidate_disk_zones().
 */
struct blk_revalidate_zone_args {
	struct gendisk *disk;		/* Disk being revalidated */
	u8 *zones_cond;			/* New condition array, 1 byte per zone */
	unsigned int nr_zones;		/* Expected total number of zones */
	unsigned int nr_conv_zones;	/* Conventional zones counted so far */
	unsigned int zone_capacity;	/* Capacity of sequential zones (sectors) */
	unsigned int last_zone_capacity; /* Capacity of the (possibly smaller) last zone */
	sector_t sector;		/* Expected start sector of the next zone */
};
2008
disk_revalidate_zone_resources(struct gendisk * disk,struct blk_revalidate_zone_args * args)2009 static int disk_revalidate_zone_resources(struct gendisk *disk,
2010 struct blk_revalidate_zone_args *args)
2011 {
2012 struct queue_limits *lim = &disk->queue->limits;
2013 unsigned int pool_size;
2014 int ret = 0;
2015
2016 args->disk = disk;
2017 args->nr_zones =
2018 DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
2019
2020 /* Cached zone conditions: 1 byte per zone */
2021 args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
2022 if (!args->zones_cond)
2023 return -ENOMEM;
2024
2025 if (!disk_need_zone_resources(disk))
2026 return 0;
2027
2028 /*
2029 * If the device has no limit on the maximum number of open and active
2030 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
2031 */
2032 pool_size = max(lim->max_open_zones, lim->max_active_zones);
2033 if (!pool_size)
2034 pool_size =
2035 min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
2036
2037 if (!disk->zone_wplugs_hash) {
2038 ret = disk_alloc_zone_resources(disk, pool_size);
2039 if (ret)
2040 kfree(args->zones_cond);
2041 }
2042
2043 return ret;
2044 }
2045
/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones;
	unsigned int pool_size, memflags;
	struct queue_limits lim;
	int ret = 0;

	lim = queue_limits_start_update(q);

	memflags = blk_mq_freeze_queue(q);

	disk->nr_zones = args->nr_zones;
	/* A device with only conventional zones must not be seen as zoned. */
	if (args->nr_conv_zones >= disk->nr_zones) {
		queue_limits_cancel_update(q);
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
		ret = -ENODEV;
		goto unfreeze;
	}

	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	/* Publish the new condition array: ownership moves to the disk. */
	disk_set_zones_cond_array(disk, args->zones_cond);
	args->zones_cond = NULL;

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. For such case, assume that the zoned device has
	 * no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	/* Without a mempool there are no plug resources left to resize. */
	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	ret = queue_limits_commit_update(q, &lim);

unfreeze:
	/* On any failure, drop all zone resources of the disk. */
	if (ret)
		disk_free_zone_resources(disk);

	blk_mq_unfreeze_queue(q, memflags);

	return ret;
}
2123
blk_revalidate_zone_cond(struct blk_zone * zone,unsigned int idx,struct blk_revalidate_zone_args * args)2124 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
2125 struct blk_revalidate_zone_args *args)
2126 {
2127 enum blk_zone_cond cond = zone->cond;
2128
2129 /* Check that the zone condition is consistent with the zone type. */
2130 switch (cond) {
2131 case BLK_ZONE_COND_NOT_WP:
2132 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2133 goto invalid_condition;
2134 break;
2135 case BLK_ZONE_COND_IMP_OPEN:
2136 case BLK_ZONE_COND_EXP_OPEN:
2137 case BLK_ZONE_COND_CLOSED:
2138 case BLK_ZONE_COND_EMPTY:
2139 case BLK_ZONE_COND_FULL:
2140 case BLK_ZONE_COND_OFFLINE:
2141 case BLK_ZONE_COND_READONLY:
2142 if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2143 goto invalid_condition;
2144 break;
2145 default:
2146 pr_warn("%s: Invalid zone condition 0x%X\n",
2147 args->disk->disk_name, cond);
2148 return -ENODEV;
2149 }
2150
2151 blk_zone_set_cond(args->zones_cond, idx, cond);
2152
2153 return 0;
2154
2155 invalid_condition:
2156 pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2157 args->disk->disk_name, cond, zone->type);
2158
2159 return -ENODEV;
2160 }
2161
blk_revalidate_conv_zone(struct blk_zone * zone,unsigned int idx,struct blk_revalidate_zone_args * args)2162 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
2163 struct blk_revalidate_zone_args *args)
2164 {
2165 struct gendisk *disk = args->disk;
2166
2167 if (zone->capacity != zone->len) {
2168 pr_warn("%s: Invalid conventional zone capacity\n",
2169 disk->disk_name);
2170 return -ENODEV;
2171 }
2172
2173 if (disk_zone_is_last(disk, zone))
2174 args->last_zone_capacity = zone->capacity;
2175
2176 args->nr_conv_zones++;
2177
2178 return 0;
2179 }
2180
/*
 * Check a sequential write required zone reported during revalidation:
 * verify that the zone capacity is constant across zones (the last zone may
 * be smaller) and, if the disk has a zone write plug hash table, make sure a
 * zone write plug exists for any zone that is neither empty nor full so its
 * write pointer can be tracked.
 */
static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * if it is constant for all zones, ignoring the last zone as it can be
	 * smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * If the device needs zone append emulation, we need to track the
	 * write pointer of all zones that are not empty nor full. So make sure
	 * we have a zone write plug for such zone if the device has a zone
	 * write plug hash table.
	 */
	if (!disk->zone_wplugs_hash)
		return 0;

	/* Empty (offset 0) and full (offset >= capacity) zones need no plug. */
	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	/*
	 * Get-or-allocate and immediately drop the reference: the goal here is
	 * only to ensure the plug exists in the hash table.
	 */
	zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
	if (!zwplug)
		return -ENOMEM;
	disk_put_zone_wplug(zwplug);

	return 0;
}
2223
/*
 * Helper function to check the validity of zones of a zoned block device.
 * Called for each zone of the zone report. Validates zone placement, size,
 * capacity, condition and type, then dispatches to the per-type checker.
 * args->sector tracks the expected start of the next zone to detect gaps.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception on an eventual
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* A zone capacity of zero or beyond the zone length is invalid. */
	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone condition */
	ret = blk_revalidate_zone_cond(zone, idx, args);
	if (ret)
		return ret;

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	/* Advance the expected start sector for the next reported zone. */
	if (!ret)
		args->sector += zone->len;

	return ret;
}
2295
/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk: Target disk
 *
 * Helper function for low-level device drivers to check, (re) allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sector limit) and the max zone append limit.
 * BIO based drivers can also use this function as long as the device queue
 * can be safely frozen.
 *
 * Return: 0 on success, or a negative error code (all zone resources of the
 * disk are released on failure).
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int memflags, noio_flag;
	struct blk_report_zones_args rep_args = {
		.cb = blk_revalidate_zone_cb,
		.data = &args,
	};
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Checks that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, &args);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}

	/* report_zones() returns the number of zones reported, or < 0. */
	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	if (ret <= 0)
		goto free_resources;

	/*
	 * If zones where reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
		goto free_resources;
	}

	/* Commit the new zone information (takes ownership of zones_cond). */
	ret = disk_update_zone_resources(disk, &args);
	if (ret)
		goto free_resources;

	return 0;

free_resources:
	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);

	kfree(args.zones_cond);
	memflags = blk_mq_freeze_queue(q);
	disk_free_zone_resources(disk);
	blk_mq_unfreeze_queue(q, memflags);

	return ret;
}
2386 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
2387
2388 /**
2389 * blk_zone_issue_zeroout - zero-fill a block range in a zone
2390 * @bdev: blockdev to write
2391 * @sector: start sector
2392 * @nr_sects: number of sectors to write
2393 * @gfp_mask: memory allocation flags (for bio_alloc)
2394 *
2395 * Description:
2396 * Zero-fill a block range in a zone (@sector must be equal to the zone write
2397 * pointer), handling potential errors due to the (initially unknown) lack of
2398 * hardware offload (See blkdev_issue_zeroout()).
2399 */
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
			   sector_t nr_sects, gfp_t gfp_mask)
{
	struct gendisk *disk = bdev->bd_disk;
	int nr_rep;
	int ret;

	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
		return -EIO;

	/* First try relying only on the zeroing hardware offload. */
	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
				   BLKDEV_ZERO_NOFALLBACK);
	if (ret != -EOPNOTSUPP)
		return ret;

	/*
	 * The failed call to blkdev_issue_zeroout() advanced the zone write
	 * pointer. Undo this using a report zone to update the zone write
	 * pointer to the correct current value.
	 */
	nr_rep = disk->fops->report_zones(disk, sector, 1, NULL);
	if (nr_rep < 0)
		return nr_rep;
	if (nr_rep != 1)
		return -EIO;

	/*
	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
	 * regular write with zero-pages.
	 */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
}
2429 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
2430
2431 #ifdef CONFIG_BLK_DEBUG_FS
queue_zone_wplug_show(struct blk_zone_wplug * zwplug,struct seq_file * m)2432 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2433 struct seq_file *m)
2434 {
2435 unsigned int zwp_wp_offset, zwp_flags;
2436 unsigned int zwp_zone_no, zwp_ref;
2437 unsigned int zwp_bio_list_size;
2438 enum blk_zone_cond zwp_cond;
2439 unsigned long flags;
2440
2441 spin_lock_irqsave(&zwplug->lock, flags);
2442 zwp_zone_no = zwplug->zone_no;
2443 zwp_flags = zwplug->flags;
2444 zwp_ref = refcount_read(&zwplug->ref);
2445 zwp_cond = zwplug->cond;
2446 zwp_wp_offset = zwplug->wp_offset;
2447 zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
2448 spin_unlock_irqrestore(&zwplug->lock, flags);
2449
2450 seq_printf(m,
2451 "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2452 zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2453 zwp_wp_offset, zwp_bio_list_size);
2454 }
2455
queue_zone_wplugs_show(void * data,struct seq_file * m)2456 int queue_zone_wplugs_show(void *data, struct seq_file *m)
2457 {
2458 struct request_queue *q = data;
2459 struct gendisk *disk = q->disk;
2460 struct blk_zone_wplug *zwplug;
2461 unsigned int i;
2462
2463 if (!disk->zone_wplugs_hash)
2464 return 0;
2465
2466 rcu_read_lock();
2467 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
2468 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2469 node)
2470 queue_zone_wplug_show(zwplug, m);
2471 rcu_read_unlock();
2472
2473 return 0;
2474 }
2475
2476 #endif
2477