Lines Matching +full:suspend-to-disk

1 // SPDX-License-Identifier: GPL-2.0-only
4 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
9 #include "dm-core.h"
10 #include "dm-rq.h"
11 #include "dm-uevent.h"
12 #include "dm-ima.h"
14 #include <linux/bio-integrity.h>
33 #include <linux/blk-crypto.h>
34 #include <linux/blk-crypto-profile.h>
47 * dm_io into one list, and reuse bio->bi_private as the list head. Before
48 * ending this fs bio, we will recover its ->bi_private.
81 * One of these is allocated (on-stack) per original bio.
101 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size; in dm_per_bio_data()
102 return (char *)bio - DM_IO_BIO_OFFSET - data_size; in dm_per_bio_data()
110 if (io->magic == DM_IO_MAGIC) in dm_bio_from_per_bio_data()
112 BUG_ON(io->magic != DM_TIO_MAGIC); in dm_bio_from_per_bio_data()
119 return container_of(bio, struct dm_target_io, clone)->target_bio_nr; in dm_bio_get_target_bio_nr()
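The three helpers above back the public per-bio-data API used by bio-based targets: a target declares ti->per_io_data_size in its constructor and DM core reserves that many bytes alongside every clone, retrievable later with dm_per_bio_data(). A minimal sketch of that usage, assuming a kernel-module context; all my_* names and both struct layouts are hypothetical, only the dm_* and bio_* calls are the real API:

#include <linux/device-mapper.h>
#include <linux/jiffies.h>
#include <linux/slab.h>

struct my_ctx {                                 /* hypothetical per-target state */
        struct dm_dev *dev;
};

struct my_io {                                  /* hypothetical per-bio payload */
        unsigned long start_jiffies;
};

static int my_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct my_ctx *mc = kzalloc(sizeof(*mc), GFP_KERNEL);

        if (!mc)
                return -ENOMEM;
        if (argc != 1 ||
            dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &mc->dev)) {
                kfree(mc);
                ti->error = "Device lookup failed";
                return -EINVAL;
        }
        /* Reserve space that dm_per_bio_data() will hand back per clone. */
        ti->per_io_data_size = sizeof(struct my_io);
        ti->private = mc;
        return 0;
}

static void my_dtr(struct dm_target *ti)
{
        struct my_ctx *mc = ti->private;

        dm_put_device(ti, mc->dev);
        kfree(mc);
}

static int my_map(struct dm_target *ti, struct bio *bio)
{
        struct my_ctx *mc = ti->private;
        struct my_io *mio = dm_per_bio_data(bio, sizeof(struct my_io));

        mio->start_jiffies = jiffies;           /* read back in .end_io */
        bio_set_dev(bio, mc->dev->bdev);
        return DM_MAPIO_REMAPPED;
}

In .end_io the same dm_per_bio_data(bio, sizeof(struct my_io)) call returns the slot again, and dm_bio_from_per_bio_data() converts in the opposite direction.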
123 #define MINOR_ALLOCED ((void *)-1)
146 * Bio-based DM's mempools' reserved IOs set by the user.
200 DM_NUMA_NODE, num_online_nodes() - 1); in dm_get_numa_node()
213 r = -ENOMEM; in local_init()
287 while (i--) in dm_init()
297 while (i--) in dm_exit()
311 return test_bit(DMF_DELETING, &md->flags); in dm_deleting_md()
314 static int dm_blk_open(struct gendisk *disk, blk_mode_t mode) in dm_blk_open() argument
320 md = disk->private_data; in dm_blk_open()
324 if (test_bit(DMF_FREEING, &md->flags) || in dm_blk_open()
331 atomic_inc(&md->open_count); in dm_blk_open()
335 return md ? 0 : -ENXIO; in dm_blk_open()
338 static void dm_blk_close(struct gendisk *disk) in dm_blk_close() argument
344 md = disk->private_data; in dm_blk_close()
348 if (atomic_dec_and_test(&md->open_count) && in dm_blk_close()
349 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) in dm_blk_close()
359 return atomic_read(&md->open_count); in dm_open_count()
372 r = -EBUSY; in dm_lock_for_deletion()
374 set_bit(DMF_DEFERRED_REMOVE, &md->flags); in dm_lock_for_deletion()
375 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) in dm_lock_for_deletion()
376 r = -EEXIST; in dm_lock_for_deletion()
378 set_bit(DMF_DELETING, &md->flags); in dm_lock_for_deletion()
391 if (test_bit(DMF_DELETING, &md->flags)) in dm_cancel_deferred_remove()
392 r = -EBUSY; in dm_cancel_deferred_remove()
394 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); in dm_cancel_deferred_remove()
408 struct mapped_device *md = bdev->bd_disk->private_data; in dm_blk_getgeo()
422 r = -ENOTTY; in dm_prepare_ioctl()
428 if (map->num_targets != 1) in dm_prepare_ioctl()
432 if (!ti->type->prepare_ioctl) in dm_prepare_ioctl()
436 return -EAGAIN; in dm_prepare_ioctl()
438 r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); in dm_prepare_ioctl()
439 if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { in dm_prepare_ioctl()
456 struct mapped_device *md = bdev->bd_disk->private_data; in dm_blk_ioctl()
471 "%s: sending ioctl %x to DM device without required privilege.", in dm_blk_ioctl()
472 current->comm, cmd); in dm_blk_ioctl()
473 r = -ENOIOCTLCMD; in dm_blk_ioctl()
478 if (!bdev->bd_disk->fops->ioctl) in dm_blk_ioctl()
479 r = -ENOTTY; in dm_blk_ioctl()
481 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); in dm_blk_ioctl()
489 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time); in dm_start_time_ns_from_clone()
495 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); in bio_is_flush_with_data()
507 return io->sectors; in dm_io_sectors()
513 struct bio *bio = io->orig_bio; in dm_io_acct()
517 bdev_start_io_acct(bio->bi_bdev, bio_op(bio), in dm_io_acct()
518 io->start_time); in dm_io_acct()
520 bdev_end_io_acct(bio->bi_bdev, bio_op(bio), in dm_io_acct()
522 io->start_time); in dm_io_acct()
526 unlikely(dm_stats_used(&io->md->stats))) { in dm_io_acct()
530 sector = bio_end_sector(bio) - io->sector_offset; in dm_io_acct()
532 sector = bio->bi_iter.bi_sector; in dm_io_acct()
534 dm_stats_account_io(&io->md->stats, bio_data_dir(bio), in dm_io_acct()
536 end, io->start_time, &io->stats_aux); in dm_io_acct()
559 spin_lock_irqsave(&io->lock, flags); in dm_start_io_acct()
561 spin_unlock_irqrestore(&io->lock, flags); in dm_start_io_acct()
565 spin_unlock_irqrestore(&io->lock, flags); in dm_start_io_acct()
582 clone = bio_alloc_clone(NULL, bio, gfp_mask, &md->mempools->io_bs); in alloc_io()
586 tio->flags = 0; in alloc_io()
588 tio->io = NULL; in alloc_io()
591 io->magic = DM_IO_MAGIC; in alloc_io()
592 io->status = BLK_STS_OK; in alloc_io()
595 atomic_set(&io->io_count, 2); in alloc_io()
596 this_cpu_inc(*md->pending_io); in alloc_io()
597 io->orig_bio = bio; in alloc_io()
598 io->md = md; in alloc_io()
599 spin_lock_init(&io->lock); in alloc_io()
600 io->start_time = jiffies; in alloc_io()
601 io->flags = 0; in alloc_io()
602 if (blk_queue_io_stat(md->queue)) in alloc_io()
606 unlikely(dm_stats_used(&md->stats))) in alloc_io()
607 dm_stats_record_start(&md->stats, &io->stats_aux); in alloc_io()
614 bio_put(&io->tio.clone); in free_io()
620 struct mapped_device *md = ci->io->md; in alloc_tio()
624 if (!ci->io->tio.io) { in alloc_tio()
625 /* the dm_target_io embedded in ci->io is available */ in alloc_tio()
626 tio = &ci->io->tio; in alloc_tio()
628 clone = &tio->clone; in alloc_tio()
630 clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, in alloc_tio()
631 &md->mempools->bs); in alloc_tio()
636 clone->bi_opf &= ~REQ_DM_POLL_LIST; in alloc_tio()
639 tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */ in alloc_tio()
642 tio->magic = DM_TIO_MAGIC; in alloc_tio()
643 tio->io = ci->io; in alloc_tio()
644 tio->ti = ti; in alloc_tio()
645 tio->target_bio_nr = target_bio_nr; in alloc_tio()
646 tio->len_ptr = len; in alloc_tio()
647 tio->old_sector = 0; in alloc_tio()
650 clone->bi_bdev = md->disk->part0; in alloc_tio()
651 if (likely(ti != NULL) && unlikely(ti->needs_bio_set_dev)) in alloc_tio()
652 bio_set_dev(clone, md->disk->part0); in alloc_tio()
655 clone->bi_iter.bi_size = to_bytes(*len); in alloc_tio()
671 * Add the bio to the list of deferred io.
677 spin_lock_irqsave(&md->deferred_lock, flags); in queue_io()
678 bio_list_add(&md->deferred, bio); in queue_io()
679 spin_unlock_irqrestore(&md->deferred_lock, flags); in queue_io()
680 queue_work(md->wq, &md->work); in queue_io()
685 * function to access the md->map field, and make sure they call
689 int *srcu_idx) __acquires(md->io_barrier) in dm_get_live_table()
691 *srcu_idx = srcu_read_lock(&md->io_barrier); in dm_get_live_table()
693 return srcu_dereference(md->map, &md->io_barrier); in dm_get_live_table()
697 int srcu_idx) __releases(md->io_barrier) in dm_put_live_table()
699 srcu_read_unlock(&md->io_barrier, srcu_idx); in dm_put_live_table()
704 synchronize_srcu(&md->io_barrier); in dm_sync_table()
709 * A fast alternative to dm_get_live_table/dm_put_live_table.
715 return rcu_dereference(md->map); in dm_get_live_table_fast()
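These SRCU accessors are how DM-core-side callers pin the live table: the pointer returned by dm_get_live_table() is valid only until dm_put_live_table() is called with the same srcu_idx, and table swaps use dm_sync_table() to wait out readers. A sketch of the read-side pattern; show_live_table_size() is hypothetical, the dm_* calls are the real interface:

static void show_live_table_size(struct mapped_device *md)
{
        int srcu_idx;
        struct dm_table *map = dm_get_live_table(md, &srcu_idx);

        /*
         * 'map' may be NULL (no table bound) and must not be dereferenced
         * or cached after the matching dm_put_live_table().
         */
        if (map)
                pr_debug("%s: live table covers %llu sectors\n",
                         dm_device_name(md),
                         (unsigned long long)dm_table_get_size(map));

        dm_put_live_table(md, srcu_idx);
}

dm_get_live_table_fast() above is the RCU-only fast path for short, non-sleeping sections.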
723 static char *_dm_claim_ptr = "I belong to device-mapper";
737 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); in open_table_device()
739 return ERR_PTR(-ENOMEM); in open_table_device()
740 refcount_set(&td->count, 1); in open_table_device()
751 * We can be called before the dm disk is added. In that case we can't in open_table_device()
755 if (md->disk->slave_dir) { in open_table_device()
756 r = bd_link_disk_holder(bdev, md->disk); in open_table_device()
761 td->dm_dev.mode = mode; in open_table_device()
762 td->dm_dev.bdev = bdev; in open_table_device()
763 td->dm_dev.bdev_file = bdev_file; in open_table_device()
764 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, in open_table_device()
766 format_dev_t(td->dm_dev.name, dev); in open_table_device()
767 list_add(&td->list, &md->table_devices); in open_table_device()
782 if (md->disk->slave_dir) in close_table_device()
783 bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); in close_table_device()
786 if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags))) in close_table_device()
787 fput(td->dm_dev.bdev_file); in close_table_device()
789 __fput_sync(td->dm_dev.bdev_file); in close_table_device()
791 put_dax(td->dm_dev.dax_dev); in close_table_device()
792 list_del(&td->list); in close_table_device()
802 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) in find_table_device()
813 mutex_lock(&md->table_devices_lock); in dm_get_table_device()
814 td = find_table_device(&md->table_devices, dev, mode); in dm_get_table_device()
818 mutex_unlock(&md->table_devices_lock); in dm_get_table_device()
822 refcount_inc(&td->count); in dm_get_table_device()
824 mutex_unlock(&md->table_devices_lock); in dm_get_table_device()
826 *result = &td->dm_dev; in dm_get_table_device()
834 mutex_lock(&md->table_devices_lock); in dm_put_table_device()
835 if (refcount_dec_and_test(&td->count)) in dm_put_table_device()
837 mutex_unlock(&md->table_devices_lock); in dm_put_table_device()
845 *geo = md->geometry; in dm_get_geometry()
855 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; in dm_set_geometry()
857 if (geo->start > sz) { in dm_set_geometry()
859 return -EINVAL; in dm_set_geometry()
862 md->geometry = *geo; in dm_set_geometry()
869 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); in __noflush_suspending()
874 struct mapped_device *md = io->md; in dm_requeue_add_io()
877 struct dm_io *next = md->requeue_list; in dm_requeue_add_io()
879 md->requeue_list = io; in dm_requeue_add_io()
880 io->next = next; in dm_requeue_add_io()
882 bio_list_add_head(&md->deferred, io->orig_bio); in dm_requeue_add_io()
889 queue_work(md->wq, &md->requeue_work); in dm_kick_requeue()
891 queue_work(md->wq, &md->work); in dm_kick_requeue()
896 * io->status is updated with error if requeue disallowed.
900 struct bio *bio = io->orig_bio; in dm_handle_requeue()
901 bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE); in dm_handle_requeue()
902 bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) && in dm_handle_requeue()
903 (bio->bi_opf & REQ_POLLED)); in dm_handle_requeue()
904 struct mapped_device *md = io->md; in dm_handle_requeue()
910 if (bio->bi_opf & REQ_POLLED) { in dm_handle_requeue()
913 * (io->orig_bio may only reflect a subset of the in dm_handle_requeue()
914 * pre-split original) so clear REQ_POLLED. in dm_handle_requeue()
923 spin_lock_irqsave(&md->deferred_lock, flags); in dm_handle_requeue()
931 * noflush suspend was interrupted or this is in dm_handle_requeue()
932 * a write to a zoned target. in dm_handle_requeue()
934 io->status = BLK_STS_IOERR; in dm_handle_requeue()
936 spin_unlock_irqrestore(&md->deferred_lock, flags); in dm_handle_requeue()
947 struct bio *bio = io->orig_bio; in __dm_io_complete()
948 struct mapped_device *md = io->md; in __dm_io_complete()
956 io_error = io->status; in __dm_io_complete()
961 * Must handle target that DM_MAPIO_SUBMITTED only to in __dm_io_complete()
969 this_cpu_dec(*md->pending_io); in __dm_io_complete()
971 /* nudge anyone waiting on suspend queue */ in __dm_io_complete()
972 if (unlikely(wq_has_sleeper(&md->wait))) in __dm_io_complete()
973 wake_up(&md->wait); in __dm_io_complete()
984 bio->bi_opf &= ~REQ_PREFLUSH; in __dm_io_complete()
989 bio->bi_status = io_error; in __dm_io_complete()
1001 /* reuse deferred lock to simplify dm_handle_requeue */ in dm_wq_requeue_work()
1002 spin_lock_irqsave(&md->deferred_lock, flags); in dm_wq_requeue_work()
1003 io = md->requeue_list; in dm_wq_requeue_work()
1004 md->requeue_list = NULL; in dm_wq_requeue_work()
1005 spin_unlock_irqrestore(&md->deferred_lock, flags); in dm_wq_requeue_work()
1008 struct dm_io *next = io->next; in dm_wq_requeue_work()
1010 dm_io_rewind(io, &md->disk->bio_split); in dm_wq_requeue_work()
1012 io->next = NULL; in dm_wq_requeue_work()
1022 * 1) io->orig_bio points to the real original bio, and the part mapped to
1025 * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io.
1031 * we may run into long bio clone chain during suspend and OOM could in dm_io_complete()
1046 if (atomic_dec_and_test(&io->io_count)) in __dm_io_dec_pending()
1054 /* Push-back supersedes any I/O errors */ in dm_io_set_error()
1055 spin_lock_irqsave(&io->lock, flags); in dm_io_set_error()
1056 if (!(io->status == BLK_STS_DM_REQUEUE && in dm_io_set_error()
1057 __noflush_suspending(io->md))) { in dm_io_set_error()
1058 io->status = error; in dm_io_set_error()
1060 spin_unlock_irqrestore(&io->lock, flags); in dm_io_set_error()
1073 * count on 'md'. But _not_ imposing verification to avoid atomic_read(),
1077 return &md->queue->limits; in dm_get_queue_limits()
1082 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); in swap_bios_limit()
1087 blk_status_t error = bio->bi_status; in clone_endio()
1089 struct dm_target *ti = tio->ti; in clone_endio()
1090 dm_endio_fn endio = likely(ti != NULL) ? ti->type->end_io : NULL; in clone_endio()
1091 struct dm_io *io = tio->io; in clone_endio()
1092 struct mapped_device *md = io->md; in clone_endio()
1096 !bdev_max_discard_sectors(bio->bi_bdev)) in clone_endio()
1097 blk_queue_disable_discard(md->queue); in clone_endio()
1099 !bdev_write_zeroes_sectors(bio->bi_bdev)) in clone_endio()
1100 blk_queue_disable_write_zeroes(md->queue); in clone_endio()
1104 unlikely(bdev_is_zoned(bio->bi_bdev))) in clone_endio()
1114 * Requeuing writes to a sequential zone of a zoned in clone_endio()
1138 up(&md->swap_bios_semaphore); in clone_endio()
1145 * Return maximum size of I/O possible at the supplied sector up to the current
1151 return ti->len - target_offset; in max_io_len_target_boundary()
1162 * Does the target need to split IO even further? in __max_io_len()
1163 * - varied (per target) IO splitting is a tenet of DM; this in __max_io_len()
1170 min(max_sectors ? : queue_max_sectors(ti->table->md->queue), in __max_io_len()
1176 return __max_io_len(ti, sector, ti->max_io_len, 0); in max_io_len()
1184 ti->error = "Maximum size of target IO is too large"; in dm_set_target_max_io_len()
1185 return -EINVAL; in dm_set_target_max_io_len()
1188 ti->max_io_len = (uint32_t) len; in dm_set_target_max_io_len()
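dm_set_target_max_io_len() is the exported way for a target constructor to set ti->max_io_len, which __max_io_len() above folds into the split size for every bio sent to that target. A minimal hypothetical .ctr fragment; MY_CHUNK_SECTORS is illustrative:

#include <linux/device-mapper.h>

#define MY_CHUNK_SECTORS 2048                   /* illustrative 1 MiB boundary */

static int my_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        int r;

        /* Have DM core split incoming I/O at this target-specific boundary. */
        r = dm_set_target_max_io_len(ti, MY_CHUNK_SECTORS);
        if (r)
                return r;                       /* len did not fit in 32 bits */

        /* ... remaining constructor work (device lookup etc.) ... */
        return 0;
}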
1196 __acquires(md->io_barrier) in dm_dax_get_live_target()
1219 long len, ret = -EIO; in dm_dax_direct_access()
1226 if (!ti->type->direct_access) in dm_dax_direct_access()
1232 ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); in dm_dax_direct_access()
1246 int ret = -EIO; in dm_dax_zero_page_range()
1253 if (WARN_ON(!ti->type->dax_zero_page_range)) { in dm_dax_zero_page_range()
1255 * ->zero_page_range() is mandatory dax operation. If we are in dm_dax_zero_page_range()
1260 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages); in dm_dax_zero_page_range()
1277 if (!ti || !ti->type->dax_recovery_write) in dm_dax_recovery_write()
1280 ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); in dm_dax_recovery_write()
1293 * dm_accept_partial_bio informs the dm that the target only wants to process
1298 * +--------------------+---------------+-------+
1300 * +--------------------+---------------+-------+
1302 * <-------------- *tio->len_ptr --------------->
1303 * <----- bio_sectors ----->
1304 * <-- n_sectors -->
1308 * Region 2 is the remaining bio size that the target wants to process.
1309 * (it may be empty if region 1 is non-empty, although there is no reason
1310 * to make it empty)
1311 * The target requires that region 3 is to be sent in the next bio.
1313 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1320 struct dm_io *io = tio->io; in dm_accept_partial_bio()
1324 BUG_ON(bio_sectors > *tio->len_ptr); in dm_accept_partial_bio()
1328 unlikely(bdev_is_zoned(bio->bi_bdev))) { in dm_accept_partial_bio()
1337 *tio->len_ptr -= bio_sectors - n_sectors; in dm_accept_partial_bio()
1338 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; in dm_accept_partial_bio()
1345 io->sectors = n_sectors; in dm_accept_partial_bio()
1346 io->sector_offset = bio_sectors(io->orig_bio); in dm_accept_partial_bio()
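dm_accept_partial_bio() lets a .map method keep only region 1 of the split described above and have DM core resubmit region 3 as a fresh bio. A hypothetical sketch of a target that refuses to cross a fixed chunk boundary; struct my_ctx and its ->dev/->start fields are illustrative:

#define MY_CHUNK_SECTORS 128                    /* illustrative power-of-two chunk */

static int my_map(struct dm_target *ti, struct bio *bio)
{
        struct my_ctx *mc = ti->private;        /* hypothetical target state */
        sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
        unsigned int to_boundary =
                MY_CHUNK_SECTORS - (offset & (MY_CHUNK_SECTORS - 1));

        /* Accept region 1 only; DM core re-submits the remainder. */
        if (bio_sectors(bio) > to_boundary)
                dm_accept_partial_bio(bio, to_boundary);

        bio_set_dev(bio, mc->dev->bdev);
        bio->bi_iter.bi_sector = mc->start + offset;
        return DM_MAPIO_REMAPPED;
}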
1351 * @clone: clone bio that DM core passed to target's .map function
1354 * Targets should use this interface to submit bios they take
1357 * Target should also enable ti->accounts_remapped_io
1362 struct dm_io *io = tio->io; in dm_submit_bio_remap()
1369 * Account io->origin_bio to DM dev on behalf of target in dm_submit_bio_remap()
1374 trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk), in dm_submit_bio_remap()
1375 tio->old_sector); in dm_submit_bio_remap()
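dm_submit_bio_remap() is for targets that return DM_MAPIO_SUBMITTED from .map and submit the remapped bio later, typically from a worker; such a target must set ti->accounts_remapped_io in its constructor so the accounting skipped in __map_bio() happens here instead. A hypothetical deferred-submission sketch (the my_* names and the context struct are illustrative):

#include <linux/device-mapper.h>
#include <linux/workqueue.h>

struct my_ctx {                                 /* hypothetical per-target state */
        struct dm_dev *dev;
        struct work_struct work;
        struct bio_list deferred;
        spinlock_t lock;
};

/* .ctr must also set: ti->accounts_remapped_io = true; */

static int my_map(struct dm_target *ti, struct bio *bio)
{
        struct my_ctx *mc = ti->private;

        bio_set_dev(bio, mc->dev->bdev);
        spin_lock_irq(&mc->lock);
        bio_list_add(&mc->deferred, bio);
        spin_unlock_irq(&mc->lock);
        schedule_work(&mc->work);
        return DM_MAPIO_SUBMITTED;              /* the worker submits it below */
}

static void my_work_fn(struct work_struct *work)
{
        struct my_ctx *mc = container_of(work, struct my_ctx, work);
        struct bio *bio;

        spin_lock_irq(&mc->lock);
        while ((bio = bio_list_pop(&mc->deferred)) != NULL) {
                spin_unlock_irq(&mc->lock);
                /* DM core does start-of-I/O accounting and remap tracing. */
                dm_submit_bio_remap(bio, NULL);
                spin_lock_irq(&mc->lock);
        }
        spin_unlock_irq(&mc->lock);
}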
1382 mutex_lock(&md->swap_bios_lock); in __set_swap_bios_limit()
1383 while (latch < md->swap_bios) { in __set_swap_bios_limit()
1385 down(&md->swap_bios_semaphore); in __set_swap_bios_limit()
1386 md->swap_bios--; in __set_swap_bios_limit()
1388 while (latch > md->swap_bios) { in __set_swap_bios_limit()
1390 up(&md->swap_bios_semaphore); in __set_swap_bios_limit()
1391 md->swap_bios++; in __set_swap_bios_limit()
1393 mutex_unlock(&md->swap_bios_lock); in __set_swap_bios_limit()
1399 struct dm_target *ti = tio->ti; in __map_bio()
1400 struct dm_io *io = tio->io; in __map_bio()
1401 struct mapped_device *md = io->md; in __map_bio()
1404 clone->bi_end_io = clone_endio; in __map_bio()
1409 tio->old_sector = clone->bi_iter.bi_sector; in __map_bio()
1415 if (unlikely(latch != md->swap_bios)) in __map_bio()
1417 down(&md->swap_bios_semaphore); in __map_bio()
1420 if (likely(ti->type->map == linear_map)) in __map_bio()
1422 else if (ti->type->map == stripe_map) in __map_bio()
1425 r = ti->type->map(ti, clone); in __map_bio()
1430 if (!ti->accounts_remapped_io) in __map_bio()
1440 up(&md->swap_bios_semaphore); in __map_bio()
1455 struct dm_io *io = ci->io; in setup_split_accounting()
1457 if (ci->sector_count > len) { in setup_split_accounting()
1463 io->sectors = len; in setup_split_accounting()
1464 io->sector_offset = bio_sectors(ci->bio); in setup_split_accounting()
1479 mutex_lock(&ci->io->md->table_devices_lock); in alloc_multiple_bios()
1489 mutex_unlock(&ci->io->md->table_devices_lock); in alloc_multiple_bios()
1508 /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ in __send_duplicate_bios()
1513 * Using alloc_multiple_bios(), even if num_bios is 1, to consistently in __send_duplicate_bios()
1529 struct dm_table *t = ci->map; in __send_empty_flush()
1533 if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == in __send_empty_flush()
1538 * Use an on-stack bio for this, it's safe since we don't in __send_empty_flush()
1539 * need to reference it after submit. It's just used as in __send_empty_flush()
1542 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); in __send_empty_flush()
1544 ci->bio = &flush_bio; in __send_empty_flush()
1545 ci->sector_count = 0; in __send_empty_flush()
1546 ci->io->tio.clone.bi_iter.bi_size = 0; in __send_empty_flush()
1548 if (!t->flush_bypasses_map) { in __send_empty_flush()
1549 for (unsigned int i = 0; i < t->num_targets; i++) { in __send_empty_flush()
1553 if (unlikely(ti->num_flush_bios == 0)) in __send_empty_flush()
1556 atomic_add(ti->num_flush_bios, &ci->io->io_count); in __send_empty_flush()
1557 bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, in __send_empty_flush()
1559 atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); in __send_empty_flush()
1563 * Note that there's no need to grab t->devices_lock here in __send_empty_flush()
1575 * used by multiple targets), so we set tio->ti = NULL. in __send_empty_flush()
1576 * We must check for NULL in the I/O processing path, to in __send_empty_flush()
1580 atomic_add(1, &ci->io->io_count); in __send_empty_flush()
1581 bio_set_dev(clone, dd->dm_dev->bdev); in __send_empty_flush()
1582 clone->bi_end_io = clone_endio; in __send_empty_flush()
1591 atomic_sub(1, &ci->io->io_count); in __send_empty_flush()
1593 bio_uninit(ci->bio); in __send_empty_flush()
1602 len = min_t(sector_t, ci->sector_count, in __send_abnormal_io()
1603 __max_io_len(ti, ci->sector, max_granularity, max_sectors)); in __send_abnormal_io()
1605 atomic_add(num_bios, &ci->io->io_count); in __send_abnormal_io()
1611 atomic_sub(num_bios - bios + 1, &ci->io->io_count); in __send_abnormal_io()
1613 ci->sector += len; in __send_abnormal_io()
1614 ci->sector_count -= len; in __send_abnormal_io()
1640 struct queue_limits *limits = dm_get_queue_limits(ti->table->md); in __process_abnormal_io()
1642 switch (bio_op(ci->bio)) { in __process_abnormal_io()
1644 num_bios = ti->num_discard_bios; in __process_abnormal_io()
1645 max_sectors = limits->max_discard_sectors; in __process_abnormal_io()
1646 if (ti->max_discard_granularity) in __process_abnormal_io()
1650 num_bios = ti->num_secure_erase_bios; in __process_abnormal_io()
1651 max_sectors = limits->max_secure_erase_sectors; in __process_abnormal_io()
1654 num_bios = ti->num_write_zeroes_bios; in __process_abnormal_io()
1655 max_sectors = limits->max_write_zeroes_sectors; in __process_abnormal_io()
1676 * Reuse ->bi_private as dm_io list head for storing all dm_io instances
1677 * associated with this bio, and this bio's bi_private needs to be
1678 * stored in dm_io->data before the reuse.
1680 * bio->bi_private is owned by fs or upper layer, so block layer won't
1686 return (struct dm_io **)&bio->bi_private; in dm_poll_list_head()
1693 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { in dm_queue_poll_io()
1694 bio->bi_opf |= REQ_DM_POLL_LIST; in dm_queue_poll_io()
1699 io->data = bio->bi_private; in dm_queue_poll_io()
1701 /* tell block layer to poll for completion */ in dm_queue_poll_io()
1702 bio->bi_cookie = ~BLK_QC_T_NONE; in dm_queue_poll_io()
1704 io->next = NULL; in dm_queue_poll_io()
1707 * bio recursed due to split, reuse original poll list, in dm_queue_poll_io()
1708 * and save bio->bi_private too. in dm_queue_poll_io()
1710 io->data = (*head)->data; in dm_queue_poll_io()
1711 io->next = *head; in dm_queue_poll_io()
1718 * Select the correct strategy for processing a non-flush bio.
1726 ti = dm_table_find_target(ci->map, ci->sector); in __split_and_process_bio()
1730 if (unlikely(ci->is_abnormal_io)) in __split_and_process_bio()
1737 ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); in __split_and_process_bio()
1739 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); in __split_and_process_bio()
1740 if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count) in __split_and_process_bio()
1745 if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) { in __split_and_process_bio()
1746 if (unlikely(!dm_target_supports_nowait(ti->type))) in __split_and_process_bio()
1757 ci->sector += len; in __split_and_process_bio()
1758 ci->sector_count -= len; in __split_and_process_bio()
1766 ci->map = map; in init_clone_info()
1767 ci->io = io; in init_clone_info()
1768 ci->bio = bio; in init_clone_info()
1769 ci->is_abnormal_io = is_abnormal; in init_clone_info()
1770 ci->submit_as_polled = false; in init_clone_info()
1771 ci->sector = bio->bi_iter.bi_sector; in init_clone_info()
1772 ci->sector_count = bio_sectors(bio); in init_clone_info()
1774 /* Shouldn't happen but sector_count was being set to 0 so... */ in init_clone_info()
1776 WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) in init_clone_info()
1777 ci->sector_count = 0; in init_clone_info()
1798 * split any large BIO to the mapped device limits to not submit BIOs in dm_zone_bio_needs_split()
1799 * that span zone boundaries and to avoid potential deadlocks with in dm_zone_bio_needs_split()
1816 struct mapped_device *md = ci->io->md; in __send_zone_reset_all_emulated()
1817 unsigned int zone_sectors = md->disk->queue->limits.chunk_sectors; in __send_zone_reset_all_emulated()
1822 sector_t sector = ti->begin; in __send_zone_reset_all_emulated()
1826 nr_zones = ti->len >> ilog2(zone_sectors); in __send_zone_reset_all_emulated()
1831 ret = dm_zone_get_reset_bitmap(md, ci->map, ti->begin, in __send_zone_reset_all_emulated()
1838 /* If we have no zone to reset, we are done. */ in __send_zone_reset_all_emulated()
1843 atomic_add(nr_zones, &ci->io->io_count); in __send_zone_reset_all_emulated()
1853 /* This may take a while, so be nice to others */ in __send_zone_reset_all_emulated()
1858 * We may need to reset thousands of zones, so let's in __send_zone_reset_all_emulated()
1865 /* Get a clone and change it to a regular reset operation. */ in __send_zone_reset_all_emulated()
1867 clone->bi_opf &= ~REQ_OP_MASK; in __send_zone_reset_all_emulated()
1868 clone->bi_opf |= REQ_OP_ZONE_RESET | REQ_SYNC; in __send_zone_reset_all_emulated()
1869 clone->bi_iter.bi_sector = sector; in __send_zone_reset_all_emulated()
1870 clone->bi_iter.bi_size = 0; in __send_zone_reset_all_emulated()
1875 nr_reset--; in __send_zone_reset_all_emulated()
1879 atomic_sub(nr_zones - num_bios, &ci->io->io_count); in __send_zone_reset_all_emulated()
1880 ci->sector_count = 0; in __send_zone_reset_all_emulated()
1893 atomic_add(1, &ci->io->io_count); in __send_zone_reset_all_native()
1895 atomic_sub(1 - bios, &ci->io->io_count); in __send_zone_reset_all_native()
1897 ci->sector_count = 0; in __send_zone_reset_all_native()
1902 struct dm_table *t = ci->map; in __send_zone_reset_all()
1905 for (unsigned int i = 0; i < t->num_targets; i++) { in __send_zone_reset_all()
1908 if (ti->zone_reset_all_supported) { in __send_zone_reset_all()
1919 atomic_sub(1, &ci->io->io_count); in __send_zone_reset_all()
1940 * Entry point to split a bio into clones and submit them to the targets.
1962 * emulation to ensure that the BIO does not cross zone in dm_split_and_process_bio()
1972 * need zone append emulation (e.g. dm-crypt). in dm_split_and_process_bio()
1978 if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { in dm_split_and_process_bio()
1981 * multiple bios and there's no easy way how to undo the in dm_split_and_process_bio()
1984 if (bio->bi_opf & REQ_PREFLUSH) { in dm_split_and_process_bio()
1990 /* Unable to do anything without dm_io. */ in dm_split_and_process_bio()
1999 if (bio->bi_opf & REQ_PREFLUSH) { in dm_split_and_process_bio()
2015 * Remainder must be passed to submit_bio_noacct() so it gets handled in dm_split_and_process_bio()
2018 bio_trim(bio, io->sectors, ci.sector_count); in dm_split_and_process_bio()
2019 trace_block_split(bio, bio->bi_iter.bi_sector); in dm_split_and_process_bio()
2024 * Drop the extra reference count for non-POLLED bio, and hold one in dm_split_and_process_bio()
2028 * in bio->bi_private, so that dm_poll_bio can poll them all. in dm_split_and_process_bio()
2036 atomic_dec(&io->io_count); in dm_split_and_process_bio()
2044 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; in dm_submit_bio()
2057 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { in dm_submit_bio()
2058 if (bio->bi_opf & REQ_NOWAIT) in dm_submit_bio()
2060 else if (bio->bi_opf & REQ_RAHEAD) in dm_submit_bio()
2075 WARN_ON_ONCE(!dm_tio_is_normal(&io->tio)); in dm_poll_dm_io()
2078 if (atomic_read(&io->io_count) > 1) in dm_poll_dm_io()
2079 bio_poll(&io->tio.clone, iob, flags); in dm_poll_dm_io()
2082 return atomic_read(&io->io_count) == 1; in dm_poll_dm_io()
2094 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) in dm_poll_bio()
2103 * submitted via submit_bio_noacct()'s depth-first submission. in dm_poll_bio()
2107 bio->bi_opf &= ~REQ_DM_POLL_LIST; in dm_poll_bio()
2108 bio->bi_private = list->data; in dm_poll_bio()
2110 for (curr = list, next = curr->next; curr; curr = next, next = in dm_poll_bio()
2111 curr ? curr->next : NULL) { in dm_poll_bio()
2119 curr->next = tmp; in dm_poll_bio()
2126 bio->bi_opf |= REQ_DM_POLL_LIST; in dm_poll_bio()
2127 /* Reset bio->bi_private to dm_io list head */ in dm_poll_bio()
2135 *---------------------------------------------------------------
2136 * An IDR is used to keep track of allocated minor numbers.
2137 *---------------------------------------------------------------
2154 return -EINVAL; in specific_minor()
2164 return r == -ENOSPC ? -EBUSY : r; in specific_minor()
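specific_minor() and next_free_minor() sit on a single IDR keyed by minor number. A generic sketch of the same allocation pattern (preload with GFP_KERNEL outside the lock, a GFP_NOWAIT idr_alloc under it, and -ENOSPC reported as -EBUSY); _my_minor_idr, _my_minor_lock and my_alloc_minor() are hypothetical stand-ins, not the dm.c symbols:

#include <linux/idr.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>

static DEFINE_IDR(_my_minor_idr);
static DEFINE_SPINLOCK(_my_minor_lock);

/* requested < 0 means "any free minor"; returns the minor or a -errno. */
static int my_alloc_minor(void *md, int requested)
{
        int r;

        idr_preload(GFP_KERNEL);
        spin_lock(&_my_minor_lock);

        if (requested < 0)
                r = idr_alloc(&_my_minor_idr, md, 0, 1 << MINORBITS, GFP_NOWAIT);
        else
                r = idr_alloc(&_my_minor_idr, md, requested, requested + 1,
                              GFP_NOWAIT);

        spin_unlock(&_my_minor_lock);
        idr_preload_end();

        return r == -ENOSPC ? -EBUSY : r;
}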
2194 dm_destroy_crypto_profile(q->crypto_profile); in dm_queue_destroy_crypto_profile()
2206 if (md->wq) in cleanup_mapped_device()
2207 destroy_workqueue(md->wq); in cleanup_mapped_device()
2208 dm_free_md_mempools(md->mempools); in cleanup_mapped_device()
2210 if (md->dax_dev) { in cleanup_mapped_device()
2211 dax_remove_host(md->disk); in cleanup_mapped_device()
2212 kill_dax(md->dax_dev); in cleanup_mapped_device()
2213 put_dax(md->dax_dev); in cleanup_mapped_device()
2214 md->dax_dev = NULL; in cleanup_mapped_device()
2217 if (md->disk) { in cleanup_mapped_device()
2219 md->disk->private_data = NULL; in cleanup_mapped_device()
2225 list_for_each_entry(td, &md->table_devices, list) { in cleanup_mapped_device()
2226 bd_unlink_disk_holder(td->dm_dev.bdev, in cleanup_mapped_device()
2227 md->disk); in cleanup_mapped_device()
2231 * Hold lock to make sure del_gendisk() won't concurrent in cleanup_mapped_device()
2234 mutex_lock(&md->table_devices_lock); in cleanup_mapped_device()
2235 del_gendisk(md->disk); in cleanup_mapped_device()
2236 mutex_unlock(&md->table_devices_lock); in cleanup_mapped_device()
2238 dm_queue_destroy_crypto_profile(md->queue); in cleanup_mapped_device()
2239 put_disk(md->disk); in cleanup_mapped_device()
2242 if (md->pending_io) { in cleanup_mapped_device()
2243 free_percpu(md->pending_io); in cleanup_mapped_device()
2244 md->pending_io = NULL; in cleanup_mapped_device()
2247 cleanup_srcu_struct(&md->io_barrier); in cleanup_mapped_device()
2249 mutex_destroy(&md->suspend_lock); in cleanup_mapped_device()
2250 mutex_destroy(&md->type_lock); in cleanup_mapped_device()
2251 mutex_destroy(&md->table_devices_lock); in cleanup_mapped_device()
2252 mutex_destroy(&md->swap_bios_lock); in cleanup_mapped_device()
2269 DMERR("unable to allocate device, out of memory."); in alloc_dev()
2284 r = init_srcu_struct(&md->io_barrier); in alloc_dev()
2288 md->numa_node_id = numa_node_id; in alloc_dev()
2289 md->init_tio_pdu = false; in alloc_dev()
2290 md->type = DM_TYPE_NONE; in alloc_dev()
2291 mutex_init(&md->suspend_lock); in alloc_dev()
2292 mutex_init(&md->type_lock); in alloc_dev()
2293 mutex_init(&md->table_devices_lock); in alloc_dev()
2294 spin_lock_init(&md->deferred_lock); in alloc_dev()
2295 atomic_set(&md->holders, 1); in alloc_dev()
2296 atomic_set(&md->open_count, 0); in alloc_dev()
2297 atomic_set(&md->event_nr, 0); in alloc_dev()
2298 atomic_set(&md->uevent_seq, 0); in alloc_dev()
2299 INIT_LIST_HEAD(&md->uevent_list); in alloc_dev()
2300 INIT_LIST_HEAD(&md->table_devices); in alloc_dev()
2301 spin_lock_init(&md->uevent_lock); in alloc_dev()
2304 * default to bio-based until DM table is loaded and md->type in alloc_dev()
2305 * established. If request-based table is loaded: blk-mq will in alloc_dev()
2308 md->disk = blk_alloc_disk(NULL, md->numa_node_id); in alloc_dev()
2309 if (IS_ERR(md->disk)) { in alloc_dev()
2310 md->disk = NULL; in alloc_dev()
2313 md->queue = md->disk->queue; in alloc_dev()
2315 init_waitqueue_head(&md->wait); in alloc_dev()
2316 INIT_WORK(&md->work, dm_wq_work); in alloc_dev()
2317 INIT_WORK(&md->requeue_work, dm_wq_requeue_work); in alloc_dev()
2318 init_waitqueue_head(&md->eventq); in alloc_dev()
2319 init_completion(&md->kobj_holder.completion); in alloc_dev()
2321 md->requeue_list = NULL; in alloc_dev()
2322 md->swap_bios = get_swap_bios(); in alloc_dev()
2323 sema_init(&md->swap_bios_semaphore, md->swap_bios); in alloc_dev()
2324 mutex_init(&md->swap_bios_lock); in alloc_dev()
2326 md->disk->major = _major; in alloc_dev()
2327 md->disk->first_minor = minor; in alloc_dev()
2328 md->disk->minors = 1; in alloc_dev()
2329 md->disk->flags |= GENHD_FL_NO_PART; in alloc_dev()
2330 md->disk->fops = &dm_blk_dops; in alloc_dev()
2331 md->disk->private_data = md; in alloc_dev()
2332 sprintf(md->disk->disk_name, "dm-%d", minor); in alloc_dev()
2336 if (PTR_ERR(dax_dev) != -EOPNOTSUPP) in alloc_dev()
2341 md->dax_dev = dax_dev; in alloc_dev()
2342 if (dax_add_host(dax_dev, md->disk)) in alloc_dev()
2346 format_dev_t(md->name, MKDEV(_major, minor)); in alloc_dev()
2348 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name); in alloc_dev()
2349 if (!md->wq) in alloc_dev()
2352 md->pending_io = alloc_percpu(unsigned long); in alloc_dev()
2353 if (!md->pending_io) in alloc_dev()
2356 r = dm_stats_init(&md->stats); in alloc_dev()
2384 int minor = MINOR(disk_devt(md->disk)); in free_dev()
2390 WARN_ON_ONCE(!list_empty(&md->table_devices)); in free_dev()
2391 dm_stats_cleanup(&md->stats); in free_dev()
2399 * Bind a table to the device.
2407 spin_lock_irqsave(&md->uevent_lock, flags); in event_callback()
2408 list_splice_init(&md->uevent_list, &uevents); in event_callback()
2409 spin_unlock_irqrestore(&md->uevent_lock, flags); in event_callback()
2411 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); in event_callback()
2413 atomic_inc(&md->event_nr); in event_callback()
2414 wake_up(&md->eventq); in event_callback()
2428 lockdep_assert_held(&md->suspend_lock); in __bind()
2435 old_map = ERR_PTR(-EINVAL); in __bind()
2439 set_capacity(md->disk, size); in __bind()
2441 ret = dm_table_set_restrictions(t, md->queue, limits); in __bind()
2443 set_capacity(md->disk, old_size); in __bind()
2452 memset(&md->geometry, 0, sizeof(md->geometry)); in __bind()
2458 * Leverage the fact that request-based DM targets are in __bind()
2459 * immutable singletons - used to optimize dm_mq_queue_rq. in __bind()
2461 md->immutable_target = dm_table_get_immutable_target(t); in __bind()
2464 * There is no need to reload with request-based dm because the in __bind()
2467 * Note for future: If you are to reload bioset, prep-ed in __bind()
2468 * requests in the queue may refer to bio from the old bioset, in __bind()
2469 * so you must walk through the queue to unprep. in __bind()
2471 if (!md->mempools) in __bind()
2472 md->mempools = t->mempools; in __bind()
2474 dm_free_md_mempools(t->mempools); in __bind()
2481 dm_free_md_mempools(md->mempools); in __bind()
2482 md->mempools = t->mempools; in __bind()
2484 t->mempools = NULL; in __bind()
2486 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in __bind()
2487 rcu_assign_pointer(md->map, (void *)t); in __bind()
2488 md->immutable_target_type = dm_table_get_immutable_target_type(t); in __bind()
2497 * Returns unbound table for the caller to free.
2501 struct dm_table *map = rcu_dereference_protected(md->map, 1); in __unbind()
2507 RCU_INIT_POINTER(md->map, NULL); in __unbind()
2522 return -ENXIO; in dm_create()
2531 * Functions to manage md->type.
2532 * All are required to hold md->type_lock.
2536 mutex_lock(&md->type_lock); in dm_lock_md_type()
2541 mutex_unlock(&md->type_lock); in dm_unlock_md_type()
2546 return md->type; in dm_get_md_type()
2551 return md->immutable_target_type; in dm_get_immutable_target_type()
2567 md->disk->fops = &dm_rq_blk_dops; in dm_setup_md_queue()
2570 DMERR("Cannot initialize queue for request-based dm mapped device"); in dm_setup_md_queue()
2580 r = dm_table_set_restrictions(t, md->queue, &limits); in dm_setup_md_queue()
2585 * Hold lock to make sure add_disk() and del_gendisk() won't concurrent in dm_setup_md_queue()
2588 mutex_lock(&md->table_devices_lock); in dm_setup_md_queue()
2589 r = add_disk(md->disk); in dm_setup_md_queue()
2590 mutex_unlock(&md->table_devices_lock); in dm_setup_md_queue()
2595 * Register the holder relationship for devices added before the disk in dm_setup_md_queue()
2598 list_for_each_entry(td, &md->table_devices, list) { in dm_setup_md_queue()
2599 r = bd_link_disk_holder(td->dm_dev.bdev, md->disk); in dm_setup_md_queue()
2608 md->type = type; in dm_setup_md_queue()
2612 list_for_each_entry_continue_reverse(td, &md->table_devices, list) in dm_setup_md_queue()
2613 bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); in dm_setup_md_queue()
2614 mutex_lock(&md->table_devices_lock); in dm_setup_md_queue()
2615 del_gendisk(md->disk); in dm_setup_md_queue()
2616 mutex_unlock(&md->table_devices_lock); in dm_setup_md_queue()
2632 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { in dm_get_md()
2646 return md->interface_ptr; in dm_get_mdptr()
2651 md->interface_ptr = ptr; in dm_set_mdptr()
2656 atomic_inc(&md->holders); in dm_get()
2657 BUG_ON(test_bit(DMF_FREEING, &md->flags)); in dm_get()
2663 if (test_bit(DMF_FREEING, &md->flags)) { in dm_hold()
2665 return -EBUSY; in dm_hold()
2675 return md->name; in dm_device_name()
2688 set_bit(DMF_FREEING, &md->flags); in __dm_destroy()
2691 blk_mark_disk_dead(md->disk); in __dm_destroy()
2695 * do not race with internal suspend. in __dm_destroy()
2697 mutex_lock(&md->suspend_lock); in __dm_destroy()
2701 set_bit(DMF_SUSPENDED, &md->flags); in __dm_destroy()
2702 set_bit(DMF_POST_SUSPENDING, &md->flags); in __dm_destroy()
2707 mutex_unlock(&md->suspend_lock); in __dm_destroy()
2710 * Rare, but there may be I/O requests still going to complete, in __dm_destroy()
2711 * for example. Wait for all references to disappear. in __dm_destroy()
2716 while (atomic_read(&md->holders)) in __dm_destroy()
2718 else if (atomic_read(&md->holders)) in __dm_destroy()
2720 dm_device_name(md), atomic_read(&md->holders)); in __dm_destroy()
2738 atomic_dec(&md->holders); in dm_put()
2748 sum += *per_cpu_ptr(md->pending_io, cpu); in dm_in_flight_bios()
2759 prepare_to_wait(&md->wait, &wait, task_state); in dm_wait_for_bios_completion()
2765 r = -ERESTARTSYS; in dm_wait_for_bios_completion()
2771 finish_wait(&md->wait, &wait); in dm_wait_for_bios_completion()
2782 if (!queue_is_mq(md->queue)) in dm_wait_for_completion()
2786 if (!blk_mq_queue_inflight(md->queue)) in dm_wait_for_completion()
2790 r = -ERESTARTSYS; in dm_wait_for_completion()
2808 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { in dm_wq_work()
2809 spin_lock_irq(&md->deferred_lock); in dm_wq_work()
2810 bio = bio_list_pop(&md->deferred); in dm_wq_work()
2811 spin_unlock_irq(&md->deferred_lock); in dm_wq_work()
2823 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); in dm_queue_flush()
2825 queue_work(md->wq, &md->work); in dm_queue_flush()
2829 * Swap in a new table, returning the old one for the caller to destroy.
2833 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); in dm_swap_table()
2837 mutex_lock(&md->suspend_lock); in dm_swap_table()
2852 limits = md->queue->limits; in dm_swap_table()
2868 mutex_unlock(&md->suspend_lock); in dm_swap_table()
2873 * Functions to lock and unlock any filesystem running on the
2880 WARN_ON(test_bit(DMF_FROZEN, &md->flags)); in lock_fs()
2882 r = bdev_freeze(md->disk->part0); in lock_fs()
2884 set_bit(DMF_FROZEN, &md->flags); in lock_fs()
2890 if (!test_bit(DMF_FROZEN, &md->flags)) in unlock_fs()
2892 bdev_thaw(md->disk->part0); in unlock_fs()
2893 clear_bit(DMF_FROZEN, &md->flags); in unlock_fs()
2902 * now. There is no request-processing activity. All new requests
2903 * are being added to md->deferred list.
2913 lockdep_assert_held(&md->suspend_lock); in __dm_suspend()
2920 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); in __dm_suspend()
2931 * Flush I/O to the device. in __dm_suspend()
2934 * (lock_fs() flushes I/Os and waits for them to complete.) in __dm_suspend()
2946 * to target drivers i.e. no one may be executing in __dm_suspend()
2949 * To get all processes out of dm_split_and_process_bio in dm_submit_bio, in __dm_suspend()
2950 * we take the write lock. To prevent any process from reentering in __dm_suspend()
2953 * flush_workqueue(md->wq). in __dm_suspend()
2955 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); in __dm_suspend()
2957 synchronize_srcu(&md->io_barrier); in __dm_suspend()
2960 * Stop md->queue before flushing md->wq in case request-based in __dm_suspend()
2961 * dm defers requests to md->wq from md->queue. in __dm_suspend()
2964 dm_stop_queue(md->queue); in __dm_suspend()
2966 flush_workqueue(md->wq); in __dm_suspend()
2970 * We call dm_wait_for_completion to wait for all existing requests in __dm_suspend()
2971 * to finish. in __dm_suspend()
2975 set_bit(dmf_suspended_flag, &md->flags); in __dm_suspend()
2978 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); in __dm_suspend()
2980 synchronize_srcu(&md->io_barrier); in __dm_suspend()
2987 dm_start_queue(md->queue); in __dm_suspend()
2998 * We need to be able to change a mapping table under a mounted
2999 * filesystem. For example we might want to move some data in
3001 * dm_bind_table, dm_suspend must be called to flush any in
3005 * Suspend mechanism in request-based dm.
3009 * 3. Wait for all in-flight I/Os to be completed or requeued.
3011 * To abort suspend, start the request_queue.
3019 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); in dm_suspend()
3022 r = -EINVAL; in dm_suspend()
3028 mutex_unlock(&md->suspend_lock); in dm_suspend()
3029 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); in dm_suspend()
3035 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in dm_suspend()
3045 set_bit(DMF_POST_SUSPENDING, &md->flags); in dm_suspend()
3047 clear_bit(DMF_POST_SUSPENDING, &md->flags); in dm_suspend()
3050 mutex_unlock(&md->suspend_lock); in dm_suspend()
3068 * Request-based dm is queueing the deferred I/Os in its request_queue. in __dm_resume()
3071 dm_start_queue(md->queue); in __dm_resume()
3084 r = -EINVAL; in dm_resume()
3085 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); in dm_resume()
3092 mutex_unlock(&md->suspend_lock); in dm_resume()
3093 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); in dm_resume()
3099 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in dm_resume()
3107 clear_bit(DMF_SUSPENDED, &md->flags); in dm_resume()
3109 mutex_unlock(&md->suspend_lock); in dm_resume()
3115 * Internal suspend/resume works like userspace-driven suspend. It waits
3116 * until all bios finish and prevents issuing new bios to the target drivers.
3124 lockdep_assert_held(&md->suspend_lock); in __dm_internal_suspend()
3126 if (md->internal_suspend_count++) in __dm_internal_suspend()
3127 return; /* nested internal suspend */ in __dm_internal_suspend()
3130 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); in __dm_internal_suspend()
3131 return; /* nest suspend */ in __dm_internal_suspend()
3134 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in __dm_internal_suspend()
3137 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is in __dm_internal_suspend()
3138 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend in __dm_internal_suspend()
3139 * would require changing .presuspend to return an error -- avoid this in __dm_internal_suspend()
3140 * until there is a need for more elaborate variants of internal suspend. in __dm_internal_suspend()
3145 set_bit(DMF_POST_SUSPENDING, &md->flags); in __dm_internal_suspend()
3147 clear_bit(DMF_POST_SUSPENDING, &md->flags); in __dm_internal_suspend()
3155 BUG_ON(!md->internal_suspend_count); in __dm_internal_resume()
3157 if (--md->internal_suspend_count) in __dm_internal_resume()
3158 return; /* resume from nested internal suspend */ in __dm_internal_resume()
3161 goto done; /* resume from nested suspend */ in __dm_internal_resume()
3163 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); in __dm_internal_resume()
3168 * tricky situation. We can't return an error to the caller. We in __dm_internal_resume()
3174 * So, we fake normal suspend here, to make sure that the in __dm_internal_resume()
3178 set_bit(DMF_SUSPENDED, &md->flags); in __dm_internal_resume()
3181 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); in __dm_internal_resume()
3183 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); in __dm_internal_resume()
3188 mutex_lock(&md->suspend_lock); in dm_internal_suspend_noflush()
3190 mutex_unlock(&md->suspend_lock); in dm_internal_suspend_noflush()
3196 mutex_lock(&md->suspend_lock); in dm_internal_resume()
3198 mutex_unlock(&md->suspend_lock); in dm_internal_resume()
3203 * Fast variants of internal suspend/resume hold md->suspend_lock,
3204 * which prevents interaction with userspace-driven suspend.
3209 mutex_lock(&md->suspend_lock); in dm_internal_suspend_fast()
3213 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); in dm_internal_suspend_fast()
3214 synchronize_srcu(&md->io_barrier); in dm_internal_suspend_fast()
3215 flush_workqueue(md->wq); in dm_internal_suspend_fast()
3228 mutex_unlock(&md->suspend_lock); in dm_internal_resume_fast()
3233 *---------------------------------------------------------------
3235 *---------------------------------------------------------------
3256 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp); in dm_kobject_uevent()
3265 return atomic_add_return(1, &md->uevent_seq); in dm_next_uevent_seq()
3270 return atomic_read(&md->event_nr); in dm_get_event_nr()
3275 return wait_event_interruptible(md->eventq, in dm_wait_event()
3276 (event_nr != atomic_read(&md->event_nr))); in dm_wait_event()
3283 spin_lock_irqsave(&md->uevent_lock, flags); in dm_uevent_add()
3284 list_add(elist, &md->uevent_list); in dm_uevent_add()
3285 spin_unlock_irqrestore(&md->uevent_lock, flags); in dm_uevent_add()
3294 return md->disk; in dm_disk()
3300 return &md->kobj_holder.kobj; in dm_kobject()
3310 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { in dm_get_from_kobject()
3323 return test_bit(DMF_SUSPENDED, &md->flags); in dm_suspended_md()
3328 return test_bit(DMF_POST_SUSPENDING, &md->flags); in dm_post_suspending_md()
3333 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); in dm_suspended_internally_md()
3338 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); in dm_test_deferred_remove_flag()
3343 return dm_suspended_md(ti->table->md); in dm_suspended()
3349 return dm_post_suspending_md(ti->table->md); in dm_post_suspending()
3355 return __noflush_suspending(ti->table->md); in dm_noflush_suspending()
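dm_suspended(), dm_post_suspending() and dm_noflush_suspending() are the exported forms of these flag tests that targets may call from their own hooks; dm-mpath, for example, consults dm_noflush_suspending() when deciding whether to push I/O back. A hypothetical .end_io sketch in that spirit:

static int my_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
{
        /* While a noflush suspend is in flight, requeue errors for retry. */
        if (*error == BLK_STS_IOERR && dm_noflush_suspending(ti))
                return DM_ENDIO_REQUEUE;

        return DM_ENDIO_DONE;
}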
3364 bioset_exit(&pools->bs); in dm_free_md_mempools()
3365 bioset_exit(&pools->io_bs); in dm_free_md_mempools()
3379 const struct block_device_operations *fops = dev->bdev->bd_disk->fops; in __dm_get_unique_id()
3381 if (!fops->get_unique_id) in __dm_get_unique_id()
3384 return fops->get_unique_id(dev->bdev->bd_disk, dm_id->id, dm_id->type); in __dm_get_unique_id()
3388 * Allow access to get_unique_id() for the first device returning a
3389 * non-zero result. Reasonable use expects all devices to have the
3392 static int dm_blk_get_unique_id(struct gendisk *disk, u8 *id, in dm_blk_get_unique_id() argument
3395 struct mapped_device *md = disk->private_data; in dm_blk_get_unique_id()
3410 if (table->num_targets != 1) in dm_blk_get_unique_id()
3414 if (!ti->type->iterate_devices) in dm_blk_get_unique_id()
3417 ret = ti->type->iterate_devices(ti, __dm_get_unique_id, &dm_id); in dm_blk_get_unique_id()
3438 struct mapped_device *md = bdev->bd_disk->private_data; in dm_call_pr()
3441 int ret = -ENOTTY, srcu_idx; in dm_call_pr()
3448 if (table->num_targets != 1) in dm_call_pr()
3453 ret = -EAGAIN; in dm_call_pr()
3457 ret = -EINVAL; in dm_call_pr()
3458 if (!ti->type->iterate_devices) in dm_call_pr()
3461 ti->type->iterate_devices(ti, fn, pr); in dm_call_pr()
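dm_call_pr() reaches the underlying paths through the table's single target and its .iterate_devices hook, so a target that wants persistent-reservation passthrough must implement that hook. A minimal hypothetical implementation for a single-device target (struct my_ctx and its fields are illustrative):

static int my_iterate_devices(struct dm_target *ti,
                              iterate_devices_callout_fn fn, void *data)
{
        struct my_ctx *mc = ti->private;        /* hypothetical: ->dev, ->start */

        /* Report the one underlying device; fn() is e.g. __dm_pr_register(). */
        return fn(ti, mc->dev, mc->start, ti->len, data);
}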
3469 * For register / unregister we need to manually call out to every path.
3475 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_register()
3478 if (!ops || !ops->pr_register) { in __dm_pr_register()
3479 pr->ret = -EOPNOTSUPP; in __dm_pr_register()
3480 return -1; in __dm_pr_register()
3483 ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags); in __dm_pr_register()
3487 if (!pr->ret) in __dm_pr_register()
3488 pr->ret = ret; in __dm_pr_register()
3490 if (pr->fail_early) in __dm_pr_register()
3491 return -1; in __dm_pr_register()
3510 /* Didn't even get to register a path */ in dm_pr_register()
3521 /* unregister all paths if we failed to register any path */ in dm_pr_register()
3535 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_reserve()
3537 if (!ops || !ops->pr_reserve) { in __dm_pr_reserve()
3538 pr->ret = -EOPNOTSUPP; in __dm_pr_reserve()
3539 return -1; in __dm_pr_reserve()
3542 pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags); in __dm_pr_reserve()
3543 if (!pr->ret) in __dm_pr_reserve()
3544 return -1; in __dm_pr_reserve()
3569 * If there is a non-All Registrants type of reservation, the release must be
3572 * try each path to make sure we got the correct path.
3578 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_release()
3580 if (!ops || !ops->pr_release) { in __dm_pr_release()
3581 pr->ret = -EOPNOTSUPP; in __dm_pr_release()
3582 return -1; in __dm_pr_release()
3585 pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type); in __dm_pr_release()
3586 if (pr->ret) in __dm_pr_release()
3587 return -1; in __dm_pr_release()
3612 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_preempt()
3614 if (!ops || !ops->pr_preempt) { in __dm_pr_preempt()
3615 pr->ret = -EOPNOTSUPP; in __dm_pr_preempt()
3616 return -1; in __dm_pr_preempt()
3619 pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type, in __dm_pr_preempt()
3620 pr->abort); in __dm_pr_preempt()
3621 if (!pr->ret) in __dm_pr_preempt()
3622 return -1; in __dm_pr_preempt()
3647 struct mapped_device *md = bdev->bd_disk->private_data; in dm_pr_clear()
3652 /* Not a real ioctl, but targets must not interpret non-DM ioctls */ in dm_pr_clear()
3658 ops = bdev->bd_disk->fops->pr_ops; in dm_pr_clear()
3659 if (ops && ops->pr_clear) in dm_pr_clear()
3660 r = ops->pr_clear(bdev, key); in dm_pr_clear()
3662 r = -EOPNOTSUPP; in dm_pr_clear()
3672 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_read_keys()
3674 if (!ops || !ops->pr_read_keys) { in __dm_pr_read_keys()
3675 pr->ret = -EOPNOTSUPP; in __dm_pr_read_keys()
3676 return -1; in __dm_pr_read_keys()
3679 pr->ret = ops->pr_read_keys(dev->bdev, pr->read_keys); in __dm_pr_read_keys()
3680 if (!pr->ret) in __dm_pr_read_keys()
3681 return -1; in __dm_pr_read_keys()
3704 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops; in __dm_pr_read_reservation()
3706 if (!ops || !ops->pr_read_reservation) { in __dm_pr_read_reservation()
3707 pr->ret = -EOPNOTSUPP; in __dm_pr_read_reservation()
3708 return -1; in __dm_pr_read_reservation()
3711 pr->ret = ops->pr_read_reservation(dev->bdev, pr->rsv); in __dm_pr_read_reservation()
3712 if (!pr->ret) in __dm_pr_read_reservation()
3713 return -1; in __dm_pr_read_reservation()
3782 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3791 MODULE_AUTHOR("Joe Thornber <dm-devel@lists.linux.dev>");