xref: /src/sys/contrib/openzfs/module/zfs/zvol.c (revision 80aae8a3f8aa70712930664572be9e6885dc0be7)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
24  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
25  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
26  * LLNL-CODE-403049.
27  *
28  * ZFS volume emulation driver.
29  *
30  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
31  * Volumes are accessed through the symbolic links named:
32  *
33  * /dev/<pool_name>/<dataset_name>
34  *
35  * Volumes are persistent through reboot and module load.  No user command
36  * needs to be run before opening and using a device.
37  *
38  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
39  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
40  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
41  * Copyright (c) 2024, 2025, Klara, Inc.
42  */
43 
44 /*
45  * Note on locking of zvol state structures.
46  *
47  * zvol_state_t represents the connection between a single dataset
48  * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a
49  * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc).
50  *
51  * The global zvol_state_lock is used to protect access to zvol_state_list and
52  * zvol_htable, which are the primary way to obtain a zvol_state_t from a name.
53  * It should not be used for anything not name-related, and you should avoid
54  * sleeping or waiting while it is held. See zvol_find_by_name(), zvol_insert(),
55  * zvol_remove().
56  *
57  * The zv_state_lock is used to protect the contents of the associated
58  * zvol_state_t. Most of the zvol_state_t is dedicated to control and
59  * configuration; almost none of it is needed for data operations (that is,
60  * read, write, flush) so this lock is rarely taken during general IO. It
61  * should be released quickly; you should avoid sleeping or waiting while it
62  * is held.
63  *
64  * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read
65  * half should be held for the duration of an IO operation. The write half
66  * should be taken when something needs to wait for IO to complete and to block further IO,
67  * eg for the duration of receive and rollback operations. This lock can be
68  * held for long periods of time.
69  *
70  * Thus, the following lock ordering applies.
71  * - take zvol_state_lock if necessary, to protect zvol_state_list
72  * - take zv_suspend_lock if necessary, by the code path in question
73  * - take zv_state_lock to protect zvol_state_t
74  *
75  * The minor operations are issued to spa->spa_zvol_taskq queues, that are
76  * single-threaded (to preserve order of minor operations), and are executed
77  * through the zvol_task_cb that dispatches the specific operations. Therefore,
78  * these operations are serialized per pool. Consequently, we can be certain
79  * that for a given zvol, there is only one operation at a time in progress.
80  * That is why one can be sure that first, zvol_state_t for a given zvol is
81  * allocated and placed on zvol_state_list, and then other minor operations for
82  * this zvol are going to proceed in the order of issue.
83  */
84 
85 #include <sys/dataset_kstats.h>
86 #include <sys/dbuf.h>
87 #include <sys/dmu_traverse.h>
88 #include <sys/dsl_dataset.h>
89 #include <sys/dsl_prop.h>
90 #include <sys/dsl_dir.h>
91 #include <sys/zap.h>
92 #include <sys/zfeature.h>
93 #include <sys/zil_impl.h>
94 #include <sys/dmu_tx.h>
95 #include <sys/zio.h>
96 #include <sys/zfs_rlock.h>
97 #include <sys/spa_impl.h>
98 #include <sys/zvol.h>
99 #include <sys/zvol_impl.h>
100 
/*
 * Module tunables.  They are defined here but consumed by the OS-specific
 * zvol glue, which is not part of this file; see the platform code for
 * exact semantics of each.
 */
unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
unsigned int zvol_threads = 0;
unsigned int zvol_num_taskqs = 0;
unsigned int zvol_request_sync = 0;

/*
 * Name -> zvol_state_t lookup structures.  Both the hash table and the
 * list are protected by zvol_state_lock (see the locking notes above).
 */
struct hlist_head *zvol_htable;
static list_t zvol_state_list;
krwlock_t zvol_state_lock;
/* Block-cloning tunables defined elsewhere; used by zvol_clone_range(). */
extern int zfs_bclone_strict_properties;
extern int zfs_bclone_wait_dirty;
/* Task queues for zvol requests; dispatch logic lives in the OS glue. */
zv_taskq_t zvol_taskqs;
114 
/*
 * Asynchronous minor operations.  These are issued to the per-pool
 * spa_zvol_taskq (single-threaded, to preserve ordering) and executed
 * via zvol_task_cb, as described in the notes at the top of this file.
 */
typedef enum {
	ZVOL_ASYNC_CREATE_MINORS,
	ZVOL_ASYNC_REMOVE_MINORS,
	ZVOL_ASYNC_RENAME_MINORS,
	ZVOL_ASYNC_SET_SNAPDEV,
	ZVOL_ASYNC_SET_VOLMODE,
	ZVOL_ASYNC_MAX
} zvol_async_op_t;
123 
/*
 * Argument payload for an asynchronous minor operation (zvol_async_op_t),
 * queued to the pool's spa_zvol_taskq.
 *
 * NOTE(review): field semantics below are inferred from the names; the
 * consumer (zvol_task_cb) is not visible in this chunk — confirm there.
 */
typedef struct {
	zvol_async_op_t zt_op;		/* which operation to perform */
	char zt_name1[MAXNAMELEN];	/* primary dataset name */
	char zt_name2[MAXNAMELEN];	/* secondary name (e.g. rename target) */
	uint64_t zt_value;		/* property value for the SET_* ops */
	uint32_t zt_total;		/* progress counters — confirm usage */
	uint32_t zt_done;
	int32_t zt_status;
	int zt_error;
} zvol_task_t;
134 
135 zv_request_task_t *
zv_request_task_create(zv_request_t zvr)136 zv_request_task_create(zv_request_t zvr)
137 {
138 	zv_request_task_t *task;
139 	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
140 	taskq_init_ent(&task->ent);
141 	task->zvr = zvr;
142 	return (task);
143 }
144 
145 void
zv_request_task_free(zv_request_task_t * task)146 zv_request_task_free(zv_request_task_t *task)
147 {
148 	kmem_free(task, sizeof (*task));
149 }
150 
151 uint64_t
zvol_name_hash(const char * name)152 zvol_name_hash(const char *name)
153 {
154 	uint64_t crc = -1ULL;
155 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
156 	for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++)
157 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
158 	return (crc);
159 }
160 
/*
 * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
 * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
 * return NULL without taking the locks. The zv_suspend_lock is always taken
 * before zv_state_lock. The mode argument indicates the mode (including none)
 * for zv_suspend_lock to be taken.
 */
zvol_state_t *
zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
{
	zvol_state_t *zv;
	struct hlist_node *p = NULL;

	/* zvol_state_lock protects the hash table itself. */
	rw_enter(&zvol_state_lock, RW_READER);
	hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
		zv = hlist_entry(p, zvol_state_t, zv_hlink);
		mutex_enter(&zv->zv_state_lock);
		if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) {
			/*
			 * this is the right zvol, take the locks in the
			 * right order
			 */
			if (mode != RW_NONE &&
			    !rw_tryenter(&zv->zv_suspend_lock, mode)) {
				/*
				 * Could not get zv_suspend_lock without
				 * blocking: drop zv_state_lock and reacquire
				 * both in the documented order
				 * (zv_suspend_lock first) to avoid inversion.
				 */
				mutex_exit(&zv->zv_state_lock);
				rw_enter(&zv->zv_suspend_lock, mode);
				mutex_enter(&zv->zv_state_lock);
				/*
				 * zvol cannot be renamed as we continue
				 * to hold zvol_state_lock
				 */
				ASSERT(zv->zv_hash == hash &&
				    strcmp(zv->zv_name, name) == 0);
			}
			rw_exit(&zvol_state_lock);
			return (zv);
		}
		mutex_exit(&zv->zv_state_lock);
	}
	rw_exit(&zvol_state_lock);

	return (NULL);
}
204 
205 /*
206  * Find a zvol_state_t given the name.
207  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
208  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
209  * before zv_state_lock. The mode argument indicates the mode (including none)
210  * for zv_suspend_lock to be taken.
211  */
212 static zvol_state_t *
zvol_find_by_name(const char * name,int mode)213 zvol_find_by_name(const char *name, int mode)
214 {
215 	return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
216 }
217 
218 /*
219  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
220  */
221 void
zvol_create_cb(objset_t * os,void * arg,cred_t * cr,dmu_tx_t * tx)222 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
223 {
224 	zfs_creat_t *zct = arg;
225 	nvlist_t *nvprops = zct->zct_props;
226 	int error;
227 	uint64_t volblocksize, volsize;
228 
229 	VERIFY0(nvlist_lookup_uint64(nvprops,
230 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize));
231 	if (nvlist_lookup_uint64(nvprops,
232 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
233 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
234 
235 	/*
236 	 * These properties must be removed from the list so the generic
237 	 * property setting step won't apply to them.
238 	 */
239 	VERIFY0(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)));
240 	(void) nvlist_remove_all(nvprops,
241 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
242 
243 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
244 	    DMU_OT_NONE, 0, tx);
245 	ASSERT0(error);
246 
247 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
248 	    DMU_OT_NONE, 0, tx);
249 	ASSERT0(error);
250 
251 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
252 	ASSERT0(error);
253 }
254 
255 /*
256  * ZFS_IOC_OBJSET_STATS entry point.
257  */
258 int
zvol_get_stats(objset_t * os,nvlist_t * nv)259 zvol_get_stats(objset_t *os, nvlist_t *nv)
260 {
261 	int error;
262 	dmu_object_info_t *doi;
263 	uint64_t val;
264 
265 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
266 	if (error)
267 		return (error);
268 
269 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
270 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
271 	error = dmu_object_info(os, ZVOL_OBJ, doi);
272 
273 	if (error == 0) {
274 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
275 		    doi->doi_data_block_size);
276 	}
277 
278 	kmem_free(doi, sizeof (dmu_object_info_t));
279 
280 	return (error);
281 }
282 
283 /*
284  * Sanity check volume size.
285  */
286 int
zvol_check_volsize(uint64_t volsize,uint64_t blocksize)287 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
288 {
289 	if (volsize == 0)
290 		return (SET_ERROR(EINVAL));
291 
292 	if (volsize % blocksize != 0)
293 		return (SET_ERROR(EINVAL));
294 
295 #ifdef _ILP32
296 	if (volsize - 1 > SPEC_MAXOFFSET_T)
297 		return (SET_ERROR(EOVERFLOW));
298 #endif
299 	return (0);
300 }
301 
/*
 * Ensure the zap is flushed then inform the VFS of the capacity change.
 */
static int
zvol_update_volsize(uint64_t volsize, objset_t *os)
{
	dmu_tx_t *tx;
	int error;
	uint64_t txg;

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	/* Mark as net-freeing so the tx is not throttled on low space. */
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	txg = dmu_tx_get_txg(tx);

	/* Record the new size in the zvol's property ZAP. */
	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	/* Wait for the size update to reach stable storage. */
	txg_wait_synced(dmu_objset_pool(os), txg);

	/* Free everything past the new end of the volume (shrink case). */
	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);

	return (error);
}
334 
/*
 * Set ZFS_PROP_VOLSIZE set entry point.  Note that modifying the volume
 * size will result in a udev "change" event being generated.
 */
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
	objset_t *os = NULL;
	uint64_t readonly;
	int error;
	boolean_t owned = B_FALSE;

	/* A read-only dataset cannot be resized. */
	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
	if (error != 0)
		return (error);
	if (readonly)
		return (SET_ERROR(EROFS));

	zvol_state_t *zv = zvol_find_by_name(name, RW_READER);

	ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
	    RW_READ_HELD(&zv->zv_suspend_lock)));

	if (zv == NULL || zv->zv_objset == NULL) {
		/*
		 * No minor exists, or it has no objset attached; own the
		 * objset ourselves for the duration of the update.  Drop
		 * zv_suspend_lock first since dmu_objset_own() may block.
		 */
		if (zv != NULL)
			rw_exit(&zv->zv_suspend_lock);
		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
		    FTAG, &os)) != 0) {
			if (zv != NULL)
				mutex_exit(&zv->zv_state_lock);
			return (error);
		}
		owned = B_TRUE;
		if (zv != NULL)
			zv->zv_objset = os;
	} else {
		os = zv->zv_objset;
	}

	dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);

	/* Validate the requested size against the volume's block size. */
	if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
	    (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
		goto out;

	error = zvol_update_volsize(volsize, os);
	if (error == 0 && zv != NULL) {
		zv->zv_volsize = volsize;
		zv->zv_changed = 1;
	}
out:
	kmem_free(doi, sizeof (dmu_object_info_t));

	if (owned) {
		dmu_objset_disown(os, B_TRUE, FTAG);
		if (zv != NULL)
			zv->zv_objset = NULL;
	} else {
		/* We still hold the read half taken by zvol_find_by_name(). */
		rw_exit(&zv->zv_suspend_lock);
	}

	if (zv != NULL)
		mutex_exit(&zv->zv_state_lock);

	/* Notify the OS-specific layer after all locks are dropped. */
	if (error == 0 && zv != NULL)
		zvol_os_update_volsize(zv, volsize);

	return (error);
}
405 
406 /*
407  * Update volthreading.
408  */
409 int
zvol_set_volthreading(const char * name,boolean_t value)410 zvol_set_volthreading(const char *name, boolean_t value)
411 {
412 	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
413 	if (zv == NULL)
414 		return (-1);
415 	zv->zv_threading = value;
416 	mutex_exit(&zv->zv_state_lock);
417 	return (0);
418 }
419 
420 /*
421  * Update zvol ro property.
422  */
423 int
zvol_set_ro(const char * name,boolean_t value)424 zvol_set_ro(const char *name, boolean_t value)
425 {
426 	zvol_state_t *zv = zvol_find_by_name(name, RW_NONE);
427 	if (zv == NULL)
428 		return (-1);
429 	if (value) {
430 		zvol_os_set_disk_ro(zv, 1);
431 		zv->zv_flags |= ZVOL_RDONLY;
432 	} else {
433 		zvol_os_set_disk_ro(zv, 0);
434 		zv->zv_flags &= ~ZVOL_RDONLY;
435 	}
436 	mutex_exit(&zv->zv_state_lock);
437 	return (0);
438 }
439 
440 /*
441  * Sanity check volume block size.
442  */
443 int
zvol_check_volblocksize(const char * name,uint64_t volblocksize)444 zvol_check_volblocksize(const char *name, uint64_t volblocksize)
445 {
446 	/* Record sizes above 128k need the feature to be enabled */
447 	if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
448 		spa_t *spa;
449 		int error;
450 
451 		if ((error = spa_open(name, &spa, FTAG)) != 0)
452 			return (error);
453 
454 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
455 			spa_close(spa, FTAG);
456 			return (SET_ERROR(ENOTSUP));
457 		}
458 
459 		/*
460 		 * We don't allow setting the property above 1MB,
461 		 * unless the tunable has been changed.
462 		 */
463 		if (volblocksize > zfs_max_recordsize) {
464 			spa_close(spa, FTAG);
465 			return (SET_ERROR(EDOM));
466 		}
467 
468 		spa_close(spa, FTAG);
469 	}
470 
471 	if (volblocksize < SPA_MINBLOCKSIZE ||
472 	    volblocksize > SPA_MAXBLOCKSIZE ||
473 	    !ISP2(volblocksize))
474 		return (SET_ERROR(EDOM));
475 
476 	return (0);
477 }
478 
/*
 * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 * implement DKIOCFREE/free-long-range.
 */
static int
zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_truncate_t *lr = arg2;
	uint64_t offset, length;

	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
	/* Freeing space; do not throttle the tx on low pool space. */
	dmu_tx_mark_netfree(tx);
	int error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		/* Record replay progress in the ZIL header. */
		(void) zil_replaying(zv->zv_zilog, tx);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
		    length);
	}

	return (error);
}
512 
/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_write_t *lr = arg2;
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);  /* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			/* Round down to the containing block boundary. */
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx,
		    DMU_READ_PREFETCH);
		/* Record replay progress in the ZIL header. */
		(void) zil_replaying(zv->zv_zilog, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}
559 
/*
 * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
{
	zvol_state_t *zv = arg1;
	lr_clone_range_t *lr = arg2;
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	int error;
	uint64_t blksz;
	uint64_t off;
	uint64_t len;

	/* The record must be large enough to hold all its block pointers. */
	ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr));
	ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t,
	    lr_bps[lr->lr_nbps]));

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(spa_feature_is_enabled(dmu_objset_spa(os),
	    SPA_FEATURE_BLOCK_CLONING));

	off = lr->lr_offset;
	len = lr->lr_length;
	blksz = lr->lr_blksz;

	/* Clones must start on a block boundary. */
	if ((off % blksz) != 0) {
		return (SET_ERROR(EINVAL));
	}

	error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
	if (error != 0 || !zv->zv_dn)
		return (error);
	tx = dmu_tx_create(os);
	dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len, blksz);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		goto out;
	}
	/* Re-reference the logged block pointers in the destination. */
	error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len,
	    tx, lr->lr_bps, lr->lr_nbps);
	if (error != 0) {
		dmu_tx_commit(tx);
		goto out;
	}

	/*
	 * zil_replaying() not only checks if we are replaying ZIL, but also
	 * updates the ZIL header to record replay progress.
	 */
	VERIFY(zil_replaying(zv->zv_zilog, tx));
	dmu_tx_commit(tx);

out:
	dnode_rele(zv->zv_dn, zv);
	zv->zv_dn = NULL;
	return (error);
}
623 
/*
 * Clone a range of blocks from zv_src into zv_dst using block cloning
 * (BRT).  Both volumes must be in the same pool and share the same
 * volblocksize; offsets and length must be block-aligned.  len is
 * clamped to both devices' sizes.  Returns 0 or an errno-style error.
 */
int
zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst,
    uint64_t outoff, uint64_t len)
{
	zilog_t	*zilog_dst;
	zfs_locked_range_t *inlr, *outlr;
	objset_t *inos, *outos;
	dmu_tx_t *tx;
	blkptr_t *bps;
	size_t maxblocks;
	int error = 0;

	/*
	 * Open the destination ZIL on first write.  The check is repeated
	 * under the write half of zv_suspend_lock (double-checked) since
	 * another thread may have opened it while we upgraded.
	 */
	rw_enter(&zv_dst->zv_suspend_lock, RW_READER);
	if (zv_dst->zv_zilog == NULL) {
		rw_exit(&zv_dst->zv_suspend_lock);
		rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER);
		if (zv_dst->zv_zilog == NULL) {
			zv_dst->zv_zilog = zil_open(zv_dst->zv_objset,
			    zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums);
			zv_dst->zv_flags |= ZVOL_WRITTEN_TO;
			VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED));
		}
		rw_downgrade(&zv_dst->zv_suspend_lock);
	}
	if (zv_src != zv_dst)
		rw_enter(&zv_src->zv_suspend_lock, RW_READER);

	inos = zv_src->zv_objset;
	outos = zv_dst->zv_objset;

	/*
	 * Sanity checks
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		error = SET_ERROR(EOPNOTSUPP);
		goto out;
	}
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Block cloning from an unencrypted dataset into an encrypted
	 * dataset and vice versa is not supported.
	 */
	if (inos->os_encrypted != outos->os_encrypted) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Cloning across encrypted datasets is possible only if they
	 * share the same master key.
	 */
	if (inos != outos && inos->os_encrypted &&
	    !dmu_objset_crypto_key_equal(inos, outos)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Cloning between datasets with different properties is possible,
	 * but it may cause confusions when copying data between them and
	 * expecting new properties to apply.
	 */
	if (zfs_bclone_strict_properties && inos != outos &&
	    !dmu_objset_is_snapshot(inos) &&
	    (inos->os_checksum != outos->os_checksum ||
	    inos->os_compress != outos->os_compress ||
	    inos->os_copies != outos->os_copies ||
	    inos->os_dedup_checksum != outos->os_dedup_checksum)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * Cloning between datasets with different special_small_blocks would
	 * bypass storage tier migration that would occur with a regular copy.
	 */
	if (zfs_bclone_strict_properties && inos != outos &&
	    !dmu_objset_is_snapshot(inos) &&
	    spa_has_special(dmu_objset_spa(inos))) {
		uint64_t in_smallblk = inos->os_zpl_special_smallblock;
		uint64_t out_smallblk = outos->os_zpl_special_smallblock;
		if (in_smallblk != out_smallblk) {
			uint64_t min_smallblk = MIN(in_smallblk, out_smallblk);
			uint64_t max_smallblk = MAX(in_smallblk, out_smallblk);
			if (min_smallblk < zv_src->zv_volblocksize &&
			    (inos->os_compress != ZIO_COMPRESS_OFF ||
			    max_smallblk >= zv_src->zv_volblocksize)) {
				error = SET_ERROR(EXDEV);
				goto out;
			}
		}
	}

	/* Offsets entirely past either device: nothing to do, success. */
	if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) {
		goto out;
	}

	/*
	 * Do not read beyond boundary
	 */
	if (len > zv_src->zv_volsize - inoff)
		len = zv_src->zv_volsize - inoff;
	if (len > zv_dst->zv_volsize - outoff)
		len = zv_dst->zv_volsize - outoff;
	if (len == 0)
		goto out;

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zv_dst->zv_flags & ZVOL_RDONLY) {
		error = SET_ERROR(EROFS);
		goto out;
	}

	/*
	 * No overlapping if we are cloning within the same file
	 */
	if (zv_src == zv_dst) {
		if (inoff < outoff + len && outoff < inoff + len) {
			error = SET_ERROR(EINVAL);
			goto out;
		}
	}

	/*
	 * Offsets and length must be at block boundaries
	 */
	if ((inoff % zv_src->zv_volblocksize) != 0 ||
	    (outoff % zv_dst->zv_volblocksize) != 0) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/*
	 * Length must be multiple of block size
	 */
	if ((len % zv_src->zv_volblocksize) != 0) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/* Size the BP buffer to the most one log record can carry. */
	zilog_dst = zv_dst->zv_zilog;
	maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);
	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
	/*
	 * Maintain predictable lock order.
	 */
	if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len,
		    RL_READER);
	}

	/* Clone up to maxblocks source blocks per iteration. */
	while (len > 0) {
		uint64_t size, last_synced_txg;
		size_t nbps = maxblocks;
		size = MIN(zv_src->zv_volblocksize * maxblocks, len);
		last_synced_txg = spa_last_synced_txg(
		    dmu_objset_spa(zv_src->zv_objset));
		error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff,
		    size, bps, &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here.  Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fallback, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_synced(dmu_objset_pool
				    (zv_src->zv_objset), last_synced_txg + 1);
				continue;
			}
			break;
		}

		tx = dmu_tx_create(zv_dst->zv_objset);
		dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size,
		    zv_src->zv_volblocksize);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size,
		    tx, bps, nbps);
		if (error != 0) {
			dmu_tx_commit(tx);
			break;
		}
		/* Log the clone so it survives a crash before the next sync. */
		zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff,
		    size, zv_src->zv_volblocksize, bps, nbps);
		dmu_tx_commit(tx);
		inoff += size;
		outoff += size;
		len -= size;
	}
	vmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);
	/* Honor sync=always by committing the log before returning. */
	if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) {
		error = zil_commit(zilog_dst, ZVOL_OBJ);
	}
out:
	if (zv_src != zv_dst)
		rw_exit(&zv_src->zv_suspend_lock);
	rw_exit(&zv_dst->zv_suspend_lock);
	return (error);
}
854 
/*
 * Handles TX_CLONE_RANGE transactions.  The cloned range is split into
 * as many log records as needed so that no record exceeds the ZIL's
 * maximum log-data size.
 */
void
zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off,
    uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps)
{
	itx_t *itx;
	lr_clone_range_t *lr;
	uint64_t partlen, max_log_data;
	size_t partnbps;

	/* Nothing to log while replaying the ZIL. */
	if (zil_replaying(zilog, tx))
		return;

	max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));

	while (nbps > 0) {
		/* Take as many BPs as fit in one log record. */
		partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
		partlen = partnbps * blksz;
		ASSERT3U(partlen, <, len + blksz);
		/* The final block may be partial. */
		partlen = MIN(partlen, len);

		itx = zil_itx_create(txtype,
		    sizeof (*lr) + sizeof (bps[0]) * partnbps);
		lr = (lr_clone_range_t *)&itx->itx_lr;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = partlen;
		lr->lr_blksz = blksz;
		lr->lr_nbps = partnbps;
		memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);

		zil_itx_assign(zilog, itx, tx);

		bps += partnbps;
		ASSERT3U(nbps, >=, partnbps);
		nbps -= partnbps;
		off += partlen;
		ASSERT3U(len, >=, partlen);
		len -= partlen;
	}
}
898 
899 static int
zvol_replay_err(void * arg1,void * arg2,boolean_t byteswap)900 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
901 {
902 	(void) arg1, (void) arg2, (void) byteswap;
903 	return (SET_ERROR(ENOTSUP));
904 }
905 
/*
 * Callback vectors for replaying records.
 * Only TX_WRITE, TX_TRUNCATE and TX_CLONE_RANGE are needed for zvol.
 */
zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL_V0 */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
	zvol_replay_err,	/* TX_SETSAXATTR */
	zvol_replay_err,	/* TX_RENAME_EXCHANGE */
	zvol_replay_err,	/* TX_RENAME_WHITEOUT */
	zvol_replay_clone_range,	/* TX_CLONE_RANGE */
};
937 
/*
 * zvol_log_write() handles TX_WRITE transactions.
 */
void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
    uint64_t size, boolean_t commit)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	itx_wr_state_t write_state;
	uint64_t log_size = 0;

	/* Nothing to log while replaying the ZIL. */
	if (zil_replaying(zilog, tx))
		return;

	/*
	 * zil_write_state() picks how data is attached to the record:
	 * copied inline now (WR_COPIED), copied later (WR_NEED_COPY),
	 * or referenced by block pointer (WR_INDIRECT).
	 */
	write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit);

	while (size) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = size;

		if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			/* Indirect records may not span block boundaries. */
			len = MIN(blocksize - P2PHASE(offset, blocksize), size);

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (wr_state == WR_COPIED &&
		    dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1,
		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) {
			/* Inline read failed; fall back to a deferred copy. */
			zil_itx_destroy(itx, 0);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		log_size += itx->itx_size;
		if (wr_state == WR_NEED_COPY)
			log_size += len;

		itx->itx_wr_state = wr_state;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = offset;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;

		zil_itx_assign(zilog, itx, tx);

		offset += len;
		size -= len;
	}

	/* Charge the logged bytes to the pool (presumably the write-log
	 * throttle; see dsl_pool_wrlog_count()). */
	dsl_pool_wrlog_count(zilog->zl_dmu_pool, log_size, tx->tx_txg);
}
999 
1000 /*
1001  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1002  */
1003 void
zvol_log_truncate(zvol_state_t * zv,dmu_tx_t * tx,uint64_t off,uint64_t len)1004 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len)
1005 {
1006 	itx_t *itx;
1007 	lr_truncate_t *lr;
1008 	zilog_t *zilog = zv->zv_zilog;
1009 
1010 	if (zil_replaying(zilog, tx))
1011 		return;
1012 
1013 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1014 	lr = (lr_truncate_t *)&itx->itx_lr;
1015 	lr->lr_foid = ZVOL_OBJ;
1016 	lr->lr_offset = off;
1017 	lr->lr_length = len;
1018 
1019 	zil_itx_assign(zilog, itx, tx);
1020 }
1021 
1022 
1023 static void
zvol_get_done(zgd_t * zgd,int error)1024 zvol_get_done(zgd_t *zgd, int error)
1025 {
1026 	(void) error;
1027 	if (zgd->zgd_db)
1028 		dmu_buf_rele(zgd->zgd_db, zgd);
1029 
1030 	zfs_rangelock_exit(zgd->zgd_lr);
1031 
1032 	kmem_free(zgd, sizeof (zgd_t));
1033 }
1034 
1035 /*
1036  * Get data to generate a TX_WRITE intent log record.
1037  */
int
zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zvol_state_t *zv = arg;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3U(size, !=, 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
		    size, RL_READER);
		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
		    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
	} else { /* indirect write */
		ASSERT3P(zio, !=, NULL);
		/*
		 * Have to lock the whole block to ensure when it's written out
		 * and its checksum is being calculated that no one can change
		 * the data. Contrarily to zfs_get_data we need not re-check
		 * blocksize after we get the lock because it cannot be changed.
		 */
		size = zv->zv_volblocksize;
		offset = P2ALIGN_TYPED(offset, size, uint64_t);
		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
		    size, RL_READER);
		error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
		    &db);
		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db != NULL);
			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			/*
			 * On success dmu_sync() takes ownership of zgd and
			 * calls zvol_get_done() asynchronously, so we must
			 * not touch zgd again on this path.
			 */
			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	/* Immediate write, or indirect failure: clean up synchronously. */
	zvol_get_done(zgd, error);

	return (error);
}
1103 
1104 /*
1105  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
1106  */
1107 
1108 void
zvol_insert(zvol_state_t * zv)1109 zvol_insert(zvol_state_t *zv)
1110 {
1111 	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
1112 	list_insert_head(&zvol_state_list, zv);
1113 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1114 }
1115 
1116 /*
1117  * Simply remove the zvol from to list of zvols.
1118  */
1119 static void
zvol_remove(zvol_state_t * zv)1120 zvol_remove(zvol_state_t *zv)
1121 {
1122 	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
1123 	list_remove(&zvol_state_list, zv);
1124 	hlist_del(&zv->zv_hlink);
1125 }
1126 
1127 /*
1128  * Setup zv after we just own the zv->objset
1129  */
static int
zvol_setup_zv(zvol_state_t *zv)
{
	uint64_t volsize;
	int error;
	uint64_t ro;
	objset_t *os = zv->zv_objset;

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));

	/* No ZIL until the volume is first written to. */
	zv->zv_zilog = NULL;
	zv->zv_flags &= ~ZVOL_WRITTEN_TO;

	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
	if (error)
		return (error);

	/* The volume size lives in the dataset's ZAP under "size". */
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		return (error);

	/* Hold the data dnode; released in zvol_shutdown_zv(). */
	error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
	if (error)
		return (error);

	/* Capacity is exported to the OS in 512-byte sectors. */
	zvol_os_set_capacity(zv, volsize >> 9);
	zv->zv_volsize = volsize;

	/*
	 * Snapshots, readonly datasets, and non-writeable pools are all
	 * exposed as read-only devices.
	 */
	if (ro || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os))) {
		zvol_os_set_disk_ro(zv, 1);
		zv->zv_flags |= ZVOL_RDONLY;
	} else {
		zvol_os_set_disk_ro(zv, 0);
		zv->zv_flags &= ~ZVOL_RDONLY;
	}
	return (0);
}
1169 
1170 /*
1171  * Shutdown every zv_objset related stuff except zv_objset itself.
1172  * The is the reverse of zvol_setup_zv.
1173  */
static void
zvol_shutdown_zv(zvol_state_t *zv)
{
	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
	    RW_LOCK_HELD(&zv->zv_suspend_lock));

	/* A ZIL only exists once the volume has been written to. */
	if (zv->zv_flags & ZVOL_WRITTEN_TO) {
		ASSERT(zv->zv_zilog != NULL);
		zil_close(zv->zv_zilog);
	}

	zv->zv_zilog = NULL;

	/* Drop the dnode hold taken in zvol_setup_zv(). */
	dnode_rele(zv->zv_dn, zv);
	zv->zv_dn = NULL;

	/*
	 * Evict cached data. We must write out any dirty data before
	 * disowning the dataset.
	 */
	if (zv->zv_flags & ZVOL_WRITTEN_TO)
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	dmu_objset_evict_dbufs(zv->zv_objset);
}
1198 
1199 /*
1200  * return the proper tag for rollback and recv
1201  */
1202 void *
zvol_tag(zvol_state_t * zv)1203 zvol_tag(zvol_state_t *zv)
1204 {
1205 	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
1206 	return (zv->zv_open_count > 0 ? zv : NULL);
1207 }
1208 
1209 /*
1210  * Suspend the zvol for recv and rollback.
1211  */
int
zvol_suspend(const char *name, zvol_state_t **zvp)
{
	zvol_state_t *zv;

	/*
	 * On success this returns with zv_state_lock held and
	 * zv_suspend_lock held as writer (per the asserts below).
	 */
	zv = zvol_find_by_name(name, RW_WRITER);

	if (zv == NULL)
		return (SET_ERROR(ENOENT));

	/* block all I/O, release in zvol_resume. */
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));

	/*
	 * If it's being removed, unlock and return error. It doesn't make any
	 * sense to try to suspend a zvol being removed, but being here also
	 * means that zvol_remove_minors_impl() is about to call zvol_remove()
	 * and then destroy the zvol_state_t, so returning a pointer to it for
	 * the caller to mess with would be a disaster anyway.
	 */
	if (zv->zv_flags & ZVOL_REMOVING) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zv->zv_suspend_lock);
		/* NB: Returning EIO here to match zfsvfs_teardown() */
		return (SET_ERROR(EIO));
	}

	/* Keeps the remover from freeing us while suspended. */
	atomic_inc(&zv->zv_suspend_ref);

	if (zv->zv_open_count > 0)
		zvol_shutdown_zv(zv);

	/*
	 * do not hold zv_state_lock across suspend/resume to
	 * avoid locking up zvol lookups
	 */
	mutex_exit(&zv->zv_state_lock);

	/* zv_suspend_lock is released in zvol_resume() */
	*zvp = zv;
	return (0);
}
1255 
int
zvol_resume(zvol_state_t *zv)
{
	int error = 0;

	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));

	mutex_enter(&zv->zv_state_lock);

	if (zv->zv_open_count > 0) {
		/*
		 * Verify the recv/rollback left us as the long-held owner
		 * of the dataset, then rebuild the state that
		 * zvol_suspend() tore down.
		 */
		VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
		VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
		VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
		dmu_objset_rele(zv->zv_objset, zv);

		error = zvol_setup_zv(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	rw_exit(&zv->zv_suspend_lock);
	/*
	 * We need this because we don't hold zvol_state_lock while releasing
	 * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
	 * zv_suspend_lock to determine it is safe to free because rwlock is
	 * not inherent atomic.
	 */
	atomic_dec(&zv->zv_suspend_ref);

	/* Wake a remover waiting for the suspend to finish. */
	if (zv->zv_flags & ZVOL_REMOVING)
		cv_broadcast(&zv->zv_removing_cv);

	return (error);
}
1290 
1291 int
zvol_first_open(zvol_state_t * zv,boolean_t readonly)1292 zvol_first_open(zvol_state_t *zv, boolean_t readonly)
1293 {
1294 	objset_t *os;
1295 	int error;
1296 
1297 	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
1298 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1299 	ASSERT(spa_namespace_held());
1300 
1301 	boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
1302 	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
1303 	if (error)
1304 		return (error);
1305 
1306 	zv->zv_objset = os;
1307 
1308 	error = zvol_setup_zv(zv);
1309 	if (error) {
1310 		dmu_objset_disown(os, 1, zv);
1311 		zv->zv_objset = NULL;
1312 	}
1313 
1314 	return (error);
1315 }
1316 
void
zvol_last_close(zvol_state_t *zv)
{
	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Wake a remover waiting for the open count to drain. */
	if (zv->zv_flags & ZVOL_REMOVING)
		cv_broadcast(&zv->zv_removing_cv);

	zvol_shutdown_zv(zv);

	/* Release the ownership taken in zvol_first_open(). */
	dmu_objset_disown(zv->zv_objset, 1, zv);
	zv->zv_objset = NULL;
}
1331 
/* One queued prefetch/create job for a single dataset name. */
typedef struct minors_job {
	list_t *list;		/* list this job is linked onto */
	list_node_t link;	/* linkage for that list */
	/* input */
	char *name;		/* kmem_strdup'd dataset name; freed by consumer */
	/* output */
	int error;		/* result of the prefetch ownership attempt */
} minors_job_t;
1340 
1341 /*
1342  * Prefetch zvol dnodes for the minors_job
1343  */
1344 static void
zvol_prefetch_minors_impl(void * arg)1345 zvol_prefetch_minors_impl(void *arg)
1346 {
1347 	minors_job_t *job = arg;
1348 	char *dsname = job->name;
1349 	objset_t *os = NULL;
1350 
1351 	job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
1352 	    FTAG, &os);
1353 	if (job->error == 0) {
1354 		dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ);
1355 		dmu_objset_disown(os, B_TRUE, FTAG);
1356 	}
1357 }
1358 
1359 /*
1360  * Mask errors to continue dmu_objset_find() traversal
1361  */
1362 static int
zvol_create_snap_minor_cb(const char * dsname,void * arg)1363 zvol_create_snap_minor_cb(const char *dsname, void *arg)
1364 {
1365 	minors_job_t *j = arg;
1366 	list_t *minors_list = j->list;
1367 	const char *name = j->name;
1368 
1369 	ASSERT0(spa_namespace_held());
1370 
1371 	/* skip the designated dataset */
1372 	if (name && strcmp(dsname, name) == 0)
1373 		return (0);
1374 
1375 	/* at this point, the dsname should name a snapshot */
1376 	if (strchr(dsname, '@') == 0) {
1377 		dprintf("zvol_create_snap_minor_cb(): "
1378 		    "%s is not a snapshot name\n", dsname);
1379 	} else {
1380 		minors_job_t *job;
1381 		char *n = kmem_strdup(dsname);
1382 		if (n == NULL)
1383 			return (0);
1384 
1385 		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
1386 		job->name = n;
1387 		job->list = minors_list;
1388 		job->error = 0;
1389 		list_insert_tail(minors_list, job);
1390 		/* don't care if dispatch fails, because job->error is 0 */
1391 		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
1392 		    TQ_SLEEP);
1393 	}
1394 
1395 	return (0);
1396 }
1397 
1398 /*
1399  * If spa_keystore_load_wkey() is called for an encrypted zvol,
1400  * we need to look for any clones also using the key. This function
1401  * is "best effort" - so we just skip over it if there are failures.
1402  */
static void
zvol_add_clones(const char *dsname, list_t *minors_list)
{
	/* Also check if it has clones */
	dsl_dir_t *dd = NULL;
	dsl_pool_t *dp = NULL;

	if (dsl_pool_hold(dsname, FTAG, &dp) != 0)
		return;

	/* Only relevant when the encryption feature is enabled. */
	if (!spa_feature_is_enabled(dp->dp_spa,
	    SPA_FEATURE_ENCRYPTION))
		goto out;

	if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0)
		goto out;

	/* A zero dd_clones object means no clones are recorded. */
	if (dsl_dir_phys(dd)->dd_clones == 0)
		goto out;

	zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	zap_attribute_t *za = zap_attribute_alloc();
	objset_t *mos = dd->dd_pool->dp_meta_objset;

	/* Walk the clones ZAP; each entry holds a dataset object number. */
	for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		dsl_dataset_t *clone;
		minors_job_t *job;

		if (dsl_dataset_hold_obj(dd->dd_pool,
		    za->za_first_integer, FTAG, &clone) == 0) {

			char name[ZFS_MAX_DATASET_NAME_LEN];
			dsl_dataset_name(clone, name);

			/* Queue a job; note: no prefetch dispatch here. */
			char *n = kmem_strdup(name);
			job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
			job->name = n;
			job->list = minors_list;
			job->error = 0;
			list_insert_tail(minors_list, job);

			dsl_dataset_rele(clone, FTAG);
		}
	}
	zap_cursor_fini(zc);
	zap_attribute_free(za);
	kmem_free(zc, sizeof (zap_cursor_t));

out:
	if (dd != NULL)
		dsl_dir_rele(dd, FTAG);
	dsl_pool_rele(dp, FTAG);
}
1458 
1459 /*
1460  * Mask errors to continue dmu_objset_find() traversal
1461  */
static int
zvol_create_minors_cb(const char *dsname, void *arg)
{
	uint64_t snapdev;
	int error;
	list_t *minors_list = arg;

	ASSERT0(spa_namespace_held());

	error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
	if (error)
		return (0);

	/*
	 * Given the name and the 'snapdev' property, create device minor nodes
	 * with the linkages to zvols/snapshots as needed.
	 * If the name represents a zvol, create a minor node for the zvol, then
	 * check if its snapshots are 'visible', and if so, iterate over the
	 * snapshots and create device minor nodes for those.
	 */
	if (strchr(dsname, '@') == 0) {
		minors_job_t *job;
		char *n = kmem_strdup(dsname);
		if (n == NULL)
			return (0);

		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
		job->name = n;
		job->list = minors_list;
		job->error = 0;
		list_insert_tail(minors_list, job);
		/* don't care if dispatch fails, because job->error is 0 */
		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
		    TQ_SLEEP);

		zvol_add_clones(dsname, minors_list);

		if (snapdev == ZFS_SNAPDEV_VISIBLE) {
			/*
			 * traverse snapshots only, do not traverse children,
			 * and skip the 'dsname'
			 *
			 * The job created above doubles as the callback
			 * argument: it carries the list to append to and the
			 * name to skip.
			 */
			(void) dmu_objset_find(dsname,
			    zvol_create_snap_minor_cb, (void *)job,
			    DS_FIND_SNAPSHOTS);
		}
	} else {
		dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
		    dsname);
	}

	return (0);
}
1515 
1516 static void
zvol_task_update_status(zvol_task_t * task,uint64_t total,uint64_t done,int error)1517 zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done,
1518     int error)
1519 {
1520 
1521 	task->zt_total += total;
1522 	task->zt_done += done;
1523 	if (task->zt_total != task->zt_done) {
1524 		task->zt_status = -1;
1525 		if (error)
1526 			task->zt_error = error;
1527 	}
1528 }
1529 
static void
zvol_task_report_status(zvol_task_t *task)
{
#ifdef ZFS_DEBUG
	/*
	 * Indexed by zt_op; the clamp below maps any out-of-range op to
	 * the final "unknown" entry.  NOTE(review): entry order appears to
	 * mirror the zvol_async_op_t enum — confirm when adding new ops.
	 */
	static const char *const msg[] = {
		"create",
		"remove",
		"rename",
		"set snapdev",
		"set volmode",
		"unknown",
	};

	/* Nothing to report if every item succeeded. */
	if (task->zt_status == 0)
		return;

	zvol_async_op_t op = MIN(task->zt_op, ZVOL_ASYNC_MAX);
	if (task->zt_error) {
		dprintf("The %s minors zvol task was not ok, last error %d\n",
		    msg[op], task->zt_error);
	} else {
		dprintf("The %s minors zvol task was not ok\n", msg[op]);
	}
#else
	(void) task;
#endif
}
1557 
1558 /*
1559  * Create minors for the specified dataset, including children and snapshots.
1560  * Pay attention to the 'snapdev' property and iterate over the snapshots
1561  * only if they are 'visible'. This approach allows one to assure that the
1562  * snapshot metadata is read from disk only if it is needed.
1563  *
1564  * The name can represent a dataset to be recursively scanned for zvols and
1565  * their snapshots, or a single zvol snapshot. If the name represents a
1566  * dataset, the scan is performed in two nested stages:
1567  * - scan the dataset for zvols, and
1568  * - for each zvol, create a minor node, then check if the zvol's snapshots
1569  *   are 'visible', and only then iterate over the snapshots if needed
1570  *
1571  * If the name represents a snapshot, a check is performed if the snapshot is
1572  * 'visible' (which also verifies that the parent is a zvol), and if so,
1573  * a minor node for that snapshot is created.
1574  */
1575 static void
zvol_create_minors_impl(zvol_task_t * task)1576 zvol_create_minors_impl(zvol_task_t *task)
1577 {
1578 	const char *name = task->zt_name1;
1579 	list_t minors_list;
1580 	minors_job_t *job;
1581 	uint64_t snapdev;
1582 	int total = 0, done = 0, last_error, error;
1583 
1584 	/*
1585 	 * Note: the dsl_pool_config_lock must not be held.
1586 	 * Minor node creation needs to obtain the zvol_state_lock.
1587 	 * zvol_open() obtains the zvol_state_lock and then the dsl pool
1588 	 * config lock.  Therefore, we can't have the config lock now if
1589 	 * we are going to wait for the zvol_state_lock, because it
1590 	 * would be a lock order inversion which could lead to deadlock.
1591 	 */
1592 
1593 	if (zvol_inhibit_dev) {
1594 		return;
1595 	}
1596 
1597 	/*
1598 	 * This is the list for prefetch jobs. Whenever we found a match
1599 	 * during dmu_objset_find, we insert a minors_job to the list and do
1600 	 * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
1601 	 * any lock because all list operation is done on the current thread.
1602 	 *
1603 	 * We will use this list to do zvol_os_create_minor after prefetch
1604 	 * so we don't have to traverse using dmu_objset_find again.
1605 	 */
1606 	list_create(&minors_list, sizeof (minors_job_t),
1607 	    offsetof(minors_job_t, link));
1608 
1609 
1610 	if (strchr(name, '@') != NULL) {
1611 		error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL);
1612 		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) {
1613 			error = zvol_os_create_minor(name);
1614 			if (error == 0) {
1615 				done++;
1616 			} else {
1617 				last_error = error;
1618 			}
1619 			total++;
1620 		}
1621 	} else {
1622 		fstrans_cookie_t cookie = spl_fstrans_mark();
1623 		(void) dmu_objset_find(name, zvol_create_minors_cb,
1624 		    &minors_list, DS_FIND_CHILDREN);
1625 		spl_fstrans_unmark(cookie);
1626 	}
1627 
1628 	taskq_wait_outstanding(system_taskq, 0);
1629 
1630 	/*
1631 	 * Prefetch is completed, we can do zvol_os_create_minor
1632 	 * sequentially.
1633 	 */
1634 	while ((job = list_remove_head(&minors_list)) != NULL) {
1635 		if (!job->error) {
1636 			error = zvol_os_create_minor(job->name);
1637 			if (error == 0) {
1638 				done++;
1639 			} else {
1640 				last_error = error;
1641 			}
1642 		} else if (job->error == EINVAL) {
1643 			/*
1644 			 * The objset, with the name requested by current job
1645 			 * exist, but have the type different from zvol.
1646 			 * Just ignore this sort of errors.
1647 			 */
1648 			done++;
1649 		} else {
1650 			last_error = job->error;
1651 		}
1652 		total++;
1653 		kmem_strfree(job->name);
1654 		kmem_free(job, sizeof (minors_job_t));
1655 	}
1656 
1657 	list_destroy(&minors_list);
1658 	zvol_task_update_status(task, total, done, last_error);
1659 }
1660 
1661 /*
1662  * Remove minors for specified dataset and, optionally, its children and
1663  * snapshots.
1664  */
1665 static void
zvol_remove_minors_impl(zvol_task_t * task)1666 zvol_remove_minors_impl(zvol_task_t *task)
1667 {
1668 	zvol_state_t *zv, *zv_next;
1669 	const char *name = task ? task->zt_name1 : NULL;
1670 	int namelen = ((name) ? strlen(name) : 0);
1671 	boolean_t children = task ? !!task->zt_value : B_TRUE;
1672 
1673 	if (zvol_inhibit_dev)
1674 		return;
1675 
1676 	/*
1677 	 * We collect up zvols that we want to remove on a separate list, so
1678 	 * that we don't have to hold zvol_state_lock for the whole time.
1679 	 *
1680 	 * We can't remove them from the global lists until we're completely
1681 	 * done with them, because that would make them appear to ZFS-side ops
1682 	 * that they don't exist, and the name might be reused, which can't be
1683 	 * good.
1684 	 */
1685 	list_t remove_list;
1686 	list_create(&remove_list, sizeof (zvol_state_t),
1687 	    offsetof(zvol_state_t, zv_remove_node));
1688 
1689 	rw_enter(&zvol_state_lock, RW_READER);
1690 
1691 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1692 		zv_next = list_next(&zvol_state_list, zv);
1693 
1694 		mutex_enter(&zv->zv_state_lock);
1695 		if (zv->zv_flags & ZVOL_REMOVING) {
1696 			/* Another thread is handling shutdown, skip it. */
1697 			mutex_exit(&zv->zv_state_lock);
1698 			continue;
1699 		}
1700 
1701 		/*
1702 		 * This zvol should be removed if:
1703 		 * - no name was offered (ie removing all at shutdown); or
1704 		 * - name matches exactly; or
1705 		 * - we were asked to remove children, and
1706 		 *   - the start of the name matches, and
1707 		 *   - there is a '/' immediately after the matched name; or
1708 		 *   - there is a '@' immediately after the matched name
1709 		 */
1710 		if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
1711 		    (children && strncmp(zv->zv_name, name, namelen) == 0 &&
1712 		    (zv->zv_name[namelen] == '/' ||
1713 		    zv->zv_name[namelen] == '@'))) {
1714 
1715 			/*
1716 			 * Matched, so mark it removal. We want to take the
1717 			 * write half of the suspend lock to make sure that
1718 			 * the zvol is not suspended, and give any data ops
1719 			 * chance to finish.
1720 			 */
1721 			mutex_exit(&zv->zv_state_lock);
1722 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1723 			mutex_enter(&zv->zv_state_lock);
1724 
1725 			if (zv->zv_flags & ZVOL_REMOVING) {
1726 				/* Another thread has taken it, let them. */
1727 				mutex_exit(&zv->zv_state_lock);
1728 				rw_exit(&zv->zv_suspend_lock);
1729 				continue;
1730 			}
1731 
1732 			/*
1733 			 * Mark it and unlock. New entries will see the flag
1734 			 * and return ENXIO.
1735 			 */
1736 			zv->zv_flags |= ZVOL_REMOVING;
1737 			mutex_exit(&zv->zv_state_lock);
1738 			rw_exit(&zv->zv_suspend_lock);
1739 
1740 			/* Put it on the list for the next stage. */
1741 			list_insert_head(&remove_list, zv);
1742 		} else
1743 			mutex_exit(&zv->zv_state_lock);
1744 	}
1745 
1746 	rw_exit(&zvol_state_lock);
1747 
1748 	/* Didn't match any, nothing to do! */
1749 	if (list_is_empty(&remove_list)) {
1750 		if (task)
1751 			task->zt_error = SET_ERROR(ENOENT);
1752 		return;
1753 	}
1754 
1755 	/* Actually shut them all down. */
1756 	for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) {
1757 		zv_next = list_next(&remove_list, zv);
1758 
1759 		mutex_enter(&zv->zv_state_lock);
1760 
1761 		/*
1762 		 * Still open or suspended, just wait. This can happen if, for
1763 		 * example, we managed to acquire zv_state_lock in the moments
1764 		 * where zvol_open() or zvol_release() are trading locks to
1765 		 * call zvol_first_open() or zvol_last_close().
1766 		 */
1767 		while (zv->zv_open_count > 0 ||
1768 		    atomic_read(&zv->zv_suspend_ref))
1769 			cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock);
1770 
1771 		/*
1772 		 * No users, shut down the OS side. This may not remove the
1773 		 * minor from view immediately, depending on the kernel
1774 		 * specifics, but it will ensure that it is unusable and that
1775 		 * this zvol_state_t can never again be reached from an OS-side
1776 		 * operation.
1777 		 */
1778 		zvol_os_remove_minor(zv);
1779 		mutex_exit(&zv->zv_state_lock);
1780 
1781 		/* Remove it from the name lookup lists */
1782 		rw_enter(&zvol_state_lock, RW_WRITER);
1783 		zvol_remove(zv);
1784 		rw_exit(&zvol_state_lock);
1785 	}
1786 
1787 	/*
1788 	 * Our own references on remove_list is the last one, free them and
1789 	 * we're done.
1790 	 */
1791 	while ((zv = list_remove_head(&remove_list)) != NULL)
1792 		zvol_os_free(zv);
1793 
1794 	list_destroy(&remove_list);
1795 }
1796 
1797 /* Remove minor for this specific volume only */
1798 static int
zvol_remove_minor_impl(const char * name)1799 zvol_remove_minor_impl(const char *name)
1800 {
1801 	if (zvol_inhibit_dev)
1802 		return (0);
1803 
1804 	zvol_task_t task;
1805 	memset(&task, 0, sizeof (zvol_task_t));
1806 	strlcpy(task.zt_name1, name, sizeof (task.zt_name1));
1807 	task.zt_value = B_FALSE;
1808 
1809 	zvol_remove_minors_impl(&task);
1810 
1811 	return (task.zt_error);
1812 }
1813 
1814 /*
1815  * Rename minors for specified dataset including children and snapshots.
1816  */
1817 static void
zvol_rename_minors_impl(zvol_task_t * task)1818 zvol_rename_minors_impl(zvol_task_t *task)
1819 {
1820 	zvol_state_t *zv, *zv_next;
1821 	const char *oldname = task->zt_name1;
1822 	const char *newname = task->zt_name2;
1823 	int total = 0, done = 0, last_error, error, oldnamelen;
1824 
1825 	if (zvol_inhibit_dev)
1826 		return;
1827 
1828 	oldnamelen = strlen(oldname);
1829 
1830 	rw_enter(&zvol_state_lock, RW_READER);
1831 
1832 	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1833 		zv_next = list_next(&zvol_state_list, zv);
1834 
1835 		mutex_enter(&zv->zv_state_lock);
1836 
1837 		if (strcmp(zv->zv_name, oldname) == 0) {
1838 			error = zvol_os_rename_minor(zv, newname);
1839 		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
1840 		    (zv->zv_name[oldnamelen] == '/' ||
1841 		    zv->zv_name[oldnamelen] == '@')) {
1842 			char *name = kmem_asprintf("%s%c%s", newname,
1843 			    zv->zv_name[oldnamelen],
1844 			    zv->zv_name + oldnamelen + 1);
1845 			error = zvol_os_rename_minor(zv, name);
1846 			kmem_strfree(name);
1847 		}
1848 		if (error) {
1849 			last_error = error;
1850 		} else {
1851 			done++;
1852 		}
1853 		total++;
1854 		mutex_exit(&zv->zv_state_lock);
1855 	}
1856 
1857 	rw_exit(&zvol_state_lock);
1858 	zvol_task_update_status(task, total, done, last_error);
1859 }
1860 
/* Argument bundle for zvol_set_snapdev_cb(). */
typedef struct zvol_snapdev_cb_arg {
	zvol_task_t *task;	/* status accumulator for the operation */
	uint64_t snapdev;	/* ZFS_SNAPDEV_* value being applied */
} zvol_snapdev_cb_arg_t;
1865 
1866 static int
zvol_set_snapdev_cb(const char * dsname,void * param)1867 zvol_set_snapdev_cb(const char *dsname, void *param)
1868 {
1869 	zvol_snapdev_cb_arg_t *arg = param;
1870 	int error = 0;
1871 
1872 	if (strchr(dsname, '@') == NULL)
1873 		return (0);
1874 
1875 	switch (arg->snapdev) {
1876 		case ZFS_SNAPDEV_VISIBLE:
1877 			error = zvol_os_create_minor(dsname);
1878 			break;
1879 		case ZFS_SNAPDEV_HIDDEN:
1880 			error = zvol_remove_minor_impl(dsname);
1881 			break;
1882 	}
1883 
1884 	zvol_task_update_status(arg->task, 1, error == 0, error);
1885 	return (0);
1886 }
1887 
1888 static void
zvol_set_snapdev_impl(zvol_task_t * task)1889 zvol_set_snapdev_impl(zvol_task_t *task)
1890 {
1891 	const char *name = task->zt_name1;
1892 	uint64_t snapdev = task->zt_value;
1893 
1894 	zvol_snapdev_cb_arg_t arg = {task, snapdev};
1895 	fstrans_cookie_t cookie = spl_fstrans_mark();
1896 	/*
1897 	 * The zvol_set_snapdev_sync() sets snapdev appropriately
1898 	 * in the dataset hierarchy. Here, we only scan snapshots.
1899 	 */
1900 	dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
1901 	spl_fstrans_unmark(cookie);
1902 }
1903 
1904 static void
zvol_set_volmode_impl(zvol_task_t * task)1905 zvol_set_volmode_impl(zvol_task_t *task)
1906 {
1907 	const char *name = task->zt_name1;
1908 	uint64_t volmode = task->zt_value;
1909 	fstrans_cookie_t cookie;
1910 	uint64_t old_volmode;
1911 	zvol_state_t *zv;
1912 	int error;
1913 
1914 	if (strchr(name, '@') != NULL)
1915 		return;
1916 
1917 	/*
1918 	 * It's unfortunate we need to remove minors before we create new ones:
1919 	 * this is necessary because our backing gendisk (zvol_state->zv_disk)
1920 	 * could be different when we set, for instance, volmode from "geom"
1921 	 * to "dev" (or vice versa).
1922 	 */
1923 	zv = zvol_find_by_name(name, RW_NONE);
1924 	if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
1925 		return;
1926 	if (zv != NULL) {
1927 		old_volmode = zv->zv_volmode;
1928 		mutex_exit(&zv->zv_state_lock);
1929 		if (old_volmode == volmode)
1930 			return;
1931 		zvol_wait_close(zv);
1932 	}
1933 	cookie = spl_fstrans_mark();
1934 	switch (volmode) {
1935 		case ZFS_VOLMODE_NONE:
1936 			error = zvol_remove_minor_impl(name);
1937 			break;
1938 		case ZFS_VOLMODE_GEOM:
1939 		case ZFS_VOLMODE_DEV:
1940 			error = zvol_remove_minor_impl(name);
1941 			/*
1942 			 * The remove minor function call above, might be not
1943 			 * needed, if volmode was switched from 'none' value.
1944 			 * Ignore error in this case.
1945 			 */
1946 			if (error == ENOENT)
1947 				error = 0;
1948 			else if (error)
1949 				break;
1950 			error = zvol_os_create_minor(name);
1951 			break;
1952 		case ZFS_VOLMODE_DEFAULT:
1953 			error = zvol_remove_minor_impl(name);
1954 			if (zvol_volmode == ZFS_VOLMODE_NONE)
1955 				break;
1956 			else /* if zvol_volmode is invalid defaults to "geom" */
1957 				error = zvol_os_create_minor(name);
1958 			break;
1959 	}
1960 	zvol_task_update_status(task, 1, error == 0, error);
1961 	spl_fstrans_unmark(cookie);
1962 }
1963 
1964 /*
1965  * The worker thread function performed asynchronously.
1966  */
1967 static void
zvol_task_cb(void * arg)1968 zvol_task_cb(void *arg)
1969 {
1970 	zvol_task_t *task = arg;
1971 
1972 	switch (task->zt_op) {
1973 	case ZVOL_ASYNC_CREATE_MINORS:
1974 		zvol_create_minors_impl(task);
1975 		break;
1976 	case ZVOL_ASYNC_REMOVE_MINORS:
1977 		zvol_remove_minors_impl(task);
1978 		break;
1979 	case ZVOL_ASYNC_RENAME_MINORS:
1980 		zvol_rename_minors_impl(task);
1981 		break;
1982 	case ZVOL_ASYNC_SET_SNAPDEV:
1983 		zvol_set_snapdev_impl(task);
1984 		break;
1985 	case ZVOL_ASYNC_SET_VOLMODE:
1986 		zvol_set_volmode_impl(task);
1987 		break;
1988 	default:
1989 		VERIFY(0);
1990 		break;
1991 	}
1992 
1993 	zvol_task_report_status(task);
1994 	kmem_free(task, sizeof (zvol_task_t));
1995 }
1996 
/* Arguments for the zvol_set_common_* sync task. */
typedef struct zvol_set_prop_int_arg {
	const char *zsda_name;		/* top-level dataset name */
	uint64_t zsda_value;		/* new property value */
	zprop_source_t zsda_source;	/* property source (local, ...) */
	zfs_prop_t zsda_prop;		/* which property (snapdev/volmode) */
} zvol_set_prop_int_arg_t;
2003 
2004 /*
2005  * Sanity check the dataset for safe use by the sync task.  No additional
2006  * conditions are imposed.
2007  */
2008 static int
zvol_set_common_check(void * arg,dmu_tx_t * tx)2009 zvol_set_common_check(void *arg, dmu_tx_t *tx)
2010 {
2011 	zvol_set_prop_int_arg_t *zsda = arg;
2012 	dsl_pool_t *dp = dmu_tx_pool(tx);
2013 	dsl_dir_t *dd;
2014 	int error;
2015 
2016 	error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
2017 	if (error != 0)
2018 		return (error);
2019 
2020 	dsl_dir_rele(dd, FTAG);
2021 
2022 	return (error);
2023 }
2024 
2025 static int
zvol_set_common_sync_cb(dsl_pool_t * dp,dsl_dataset_t * ds,void * arg)2026 zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2027 {
2028 	zvol_set_prop_int_arg_t *zsda = arg;
2029 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
2030 	zvol_task_t *task;
2031 	uint64_t prop;
2032 
2033 	const char *prop_name = zfs_prop_to_name(zsda->zsda_prop);
2034 	dsl_dataset_name(ds, dsname);
2035 
2036 	if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0)
2037 		return (0);
2038 
2039 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
2040 	if (zsda->zsda_prop == ZFS_PROP_VOLMODE) {
2041 		task->zt_op = ZVOL_ASYNC_SET_VOLMODE;
2042 	} else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) {
2043 		task->zt_op = ZVOL_ASYNC_SET_SNAPDEV;
2044 	} else {
2045 		kmem_free(task, sizeof (zvol_task_t));
2046 		return (0);
2047 	}
2048 	task->zt_value = prop;
2049 	strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1));
2050 	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
2051 	    task, TQ_SLEEP);
2052 	return (0);
2053 }
2054 
2055 /*
2056  * Traverse all child datasets and apply the property appropriately.
2057  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
2058  * dataset and read the effective "property" on every child in the callback
2059  * function: this is because the value is not guaranteed to be the same in the
2060  * whole dataset hierarchy.
2061  */
2062 static void
zvol_set_common_sync(void * arg,dmu_tx_t * tx)2063 zvol_set_common_sync(void *arg, dmu_tx_t *tx)
2064 {
2065 	zvol_set_prop_int_arg_t *zsda = arg;
2066 	dsl_pool_t *dp = dmu_tx_pool(tx);
2067 	dsl_dir_t *dd;
2068 	dsl_dataset_t *ds;
2069 	int error;
2070 
2071 	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
2072 
2073 	error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
2074 	if (error == 0) {
2075 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop),
2076 		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
2077 		    &zsda->zsda_value, tx);
2078 		dsl_dataset_rele(ds, FTAG);
2079 	}
2080 
2081 	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb,
2082 	    zsda, DS_FIND_CHILDREN);
2083 
2084 	dsl_dir_rele(dd, FTAG);
2085 }
2086 
2087 int
zvol_set_common(const char * ddname,zfs_prop_t prop,zprop_source_t source,uint64_t val)2088 zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source,
2089     uint64_t val)
2090 {
2091 	zvol_set_prop_int_arg_t zsda;
2092 
2093 	zsda.zsda_name = ddname;
2094 	zsda.zsda_source = source;
2095 	zsda.zsda_value = val;
2096 	zsda.zsda_prop = prop;
2097 
2098 	return (dsl_sync_task(ddname, zvol_set_common_check,
2099 	    zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
2100 }
2101 
2102 void
zvol_create_minors(const char * name)2103 zvol_create_minors(const char *name)
2104 {
2105 	spa_t *spa;
2106 	zvol_task_t *task;
2107 	taskqid_t id;
2108 
2109 	if (spa_open(name, &spa, FTAG) != 0)
2110 		return;
2111 
2112 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
2113 	task->zt_op = ZVOL_ASYNC_CREATE_MINORS;
2114 	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
2115 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
2116 	if (id != TASKQID_INVALID)
2117 		taskq_wait_id(spa->spa_zvol_taskq, id);
2118 
2119 	spa_close(spa, FTAG);
2120 }
2121 
2122 void
zvol_remove_minors(spa_t * spa,const char * name,boolean_t async)2123 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
2124 {
2125 	zvol_task_t *task;
2126 	taskqid_t id;
2127 
2128 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
2129 	task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
2130 	strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
2131 	task->zt_value = B_TRUE;
2132 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
2133 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
2134 		taskq_wait_id(spa->spa_zvol_taskq, id);
2135 }
2136 
2137 void
zvol_rename_minors(spa_t * spa,const char * name1,const char * name2,boolean_t async)2138 zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
2139     boolean_t async)
2140 {
2141 	zvol_task_t *task;
2142 	taskqid_t id;
2143 
2144 	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
2145 	task->zt_op = ZVOL_ASYNC_RENAME_MINORS;
2146 	strlcpy(task->zt_name1, name1, sizeof (task->zt_name1));
2147 	strlcpy(task->zt_name2, name2, sizeof (task->zt_name2));
2148 	id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
2149 	if ((async == B_FALSE) && (id != TASKQID_INVALID))
2150 		taskq_wait_id(spa->spa_zvol_taskq, id);
2151 }
2152 
2153 boolean_t
zvol_is_zvol(const char * name)2154 zvol_is_zvol(const char *name)
2155 {
2156 
2157 	return (zvol_os_is_zvol(name));
2158 }
2159 
2160 int
zvol_init_impl(void)2161 zvol_init_impl(void)
2162 {
2163 	int i;
2164 
2165 	/*
2166 	 * zvol_threads is the module param the user passes in.
2167 	 *
2168 	 * zvol_actual_threads is what we use internally, since the user can
2169 	 * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
2170 	 */
2171 	static unsigned int zvol_actual_threads;
2172 
2173 	if (zvol_threads == 0) {
2174 		/*
2175 		 * See dde9380a1 for why 32 was chosen here.  This should
2176 		 * probably be refined to be some multiple of the number
2177 		 * of CPUs.
2178 		 */
2179 		zvol_actual_threads = MAX(max_ncpus, 32);
2180 	} else {
2181 		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
2182 	}
2183 
2184 	/*
2185 	 * Use at least 32 zvol_threads but for many core system,
2186 	 * prefer 6 threads per taskq, but no more taskqs
2187 	 * than threads in them on large systems.
2188 	 *
2189 	 *                 taskq   total
2190 	 * cpus    taskqs  threads threads
2191 	 * ------- ------- ------- -------
2192 	 * 1       1       32       32
2193 	 * 2       1       32       32
2194 	 * 4       1       32       32
2195 	 * 8       2       16       32
2196 	 * 16      3       11       33
2197 	 * 32      5       7        35
2198 	 * 64      8       8        64
2199 	 * 128     11      12       132
2200 	 * 256     16      16       256
2201 	 */
2202 	zv_taskq_t *ztqs = &zvol_taskqs;
2203 	int num_tqs = MIN(max_ncpus, zvol_num_taskqs);
2204 	if (num_tqs == 0) {
2205 		num_tqs = 1 + max_ncpus / 6;
2206 		while (num_tqs * num_tqs > zvol_actual_threads)
2207 			num_tqs--;
2208 	}
2209 
2210 	int per_tq_thread = zvol_actual_threads / num_tqs;
2211 	if (per_tq_thread * num_tqs < zvol_actual_threads)
2212 		per_tq_thread++;
2213 
2214 	ztqs->tqs_cnt = num_tqs;
2215 	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
2216 
2217 	for (uint_t i = 0; i < num_tqs; i++) {
2218 		char name[32];
2219 		(void) snprintf(name, sizeof (name), "%s_tq-%u",
2220 		    ZVOL_DRIVER, i);
2221 		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
2222 		    maxclsyspri, per_tq_thread, INT_MAX,
2223 		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
2224 		if (ztqs->tqs_taskq[i] == NULL) {
2225 			for (int j = i - 1; j >= 0; j--)
2226 				taskq_destroy(ztqs->tqs_taskq[j]);
2227 			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
2228 			    sizeof (taskq_t *));
2229 			ztqs->tqs_taskq = NULL;
2230 			return (SET_ERROR(ENOMEM));
2231 		}
2232 	}
2233 
2234 	list_create(&zvol_state_list, sizeof (zvol_state_t),
2235 	    offsetof(zvol_state_t, zv_next));
2236 	rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
2237 
2238 	zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
2239 	    KM_SLEEP);
2240 	for (i = 0; i < ZVOL_HT_SIZE; i++)
2241 		INIT_HLIST_HEAD(&zvol_htable[i]);
2242 
2243 	return (0);
2244 }
2245 
2246 void
zvol_fini_impl(void)2247 zvol_fini_impl(void)
2248 {
2249 	zv_taskq_t *ztqs = &zvol_taskqs;
2250 
2251 	zvol_remove_minors_impl(NULL);
2252 
2253 	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
2254 	list_destroy(&zvol_state_list);
2255 	rw_destroy(&zvol_state_lock);
2256 
2257 	if (ztqs->tqs_taskq == NULL) {
2258 		ASSERT0(ztqs->tqs_cnt);
2259 	} else {
2260 		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
2261 			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
2262 			taskq_destroy(ztqs->tqs_taskq[i]);
2263 		}
2264 		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
2265 		    sizeof (taskq_t *));
2266 		ztqs->tqs_taskq = NULL;
2267 	}
2268 }
2269 
/* Module parameters (tunables) exposed by the zvol subsystem. */
ZFS_MODULE_PARAM(zfs_vol, zvol_, inhibit_dev, UINT, ZMOD_RW,
	"Do not create zvol device nodes");
ZFS_MODULE_PARAM(zfs_vol, zvol_, prefetch_bytes, UINT, ZMOD_RW,
	"Prefetch N bytes at zvol start+end");
/* NOTE(review): prefix "zvol_vol" + "mode" presumably forms "zvol_volmode" -- confirm. */
ZFS_MODULE_PARAM(zfs_vol, zvol_vol, mode, UINT, ZMOD_RW,
	"Default volmode property value");
ZFS_MODULE_PARAM(zfs_vol, zvol_, threads, UINT, ZMOD_RW,
	"Number of threads for I/O requests. Set to 0 to use all active CPUs");
ZFS_MODULE_PARAM(zfs_vol, zvol_, num_taskqs, UINT, ZMOD_RW,
	"Number of zvol taskqs");
ZFS_MODULE_PARAM(zfs_vol, zvol_, request_sync, UINT, ZMOD_RW,
	"Synchronously handle bio requests");
2282