xref: /src/sys/contrib/openzfs/module/zfs/spa.c (revision 80aae8a3f8aa70712930664572be9e6885dc0be7)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
26  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
27  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28  * Copyright 2013 Saso Kiselkov. All rights reserved.
29  * Copyright (c) 2014 Integros [integros.com]
30  * Copyright 2016 Toomas Soome <tsoome@me.com>
31  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
32  * Copyright 2018 Joyent, Inc.
33  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
34  * Copyright 2017 Joyent, Inc.
35  * Copyright (c) 2017, Intel Corporation.
36  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
37  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
38  * Copyright (c) 2023, 2024, Klara Inc.
39  */
40 
41 /*
42  * SPA: Storage Pool Allocator
43  *
44  * This file contains all the routines used when modifying on-disk SPA state.
45  * This includes opening, importing, destroying, exporting a pool, and syncing a
46  * pool.
47  */
48 
49 #include <sys/zfs_context.h>
50 #include <sys/fm/fs/zfs.h>
51 #include <sys/spa_impl.h>
52 #include <sys/zio.h>
53 #include <sys/zio_checksum.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_tx.h>
56 #include <sys/zap.h>
57 #include <sys/zil.h>
58 #include <sys/brt.h>
59 #include <sys/ddt.h>
60 #include <sys/vdev_impl.h>
61 #include <sys/vdev_removal.h>
62 #include <sys/vdev_indirect_mapping.h>
63 #include <sys/vdev_indirect_births.h>
64 #include <sys/vdev_initialize.h>
65 #include <sys/vdev_rebuild.h>
66 #include <sys/vdev_trim.h>
67 #include <sys/vdev_disk.h>
68 #include <sys/vdev_raidz.h>
69 #include <sys/vdev_draid.h>
70 #include <sys/metaslab.h>
71 #include <sys/metaslab_impl.h>
72 #include <sys/mmp.h>
73 #include <sys/uberblock_impl.h>
74 #include <sys/txg.h>
75 #include <sys/avl.h>
76 #include <sys/bpobj.h>
77 #include <sys/dmu_traverse.h>
78 #include <sys/dmu_objset.h>
79 #include <sys/unique.h>
80 #include <sys/dsl_pool.h>
81 #include <sys/dsl_dataset.h>
82 #include <sys/dsl_dir.h>
83 #include <sys/dsl_prop.h>
84 #include <sys/dsl_synctask.h>
85 #include <sys/fs/zfs.h>
86 #include <sys/arc.h>
87 #include <sys/callb.h>
88 #include <sys/systeminfo.h>
89 #include <sys/zfs_ioctl.h>
90 #include <sys/dsl_scan.h>
91 #include <sys/zfeature.h>
92 #include <sys/dsl_destroy.h>
93 #include <sys/zvol.h>
94 
95 #ifdef	_KERNEL
96 #include <sys/fm/protocol.h>
97 #include <sys/fm/util.h>
98 #include <sys/callb.h>
99 #include <sys/zone.h>
100 #include <sys/vmsystm.h>
101 #endif	/* _KERNEL */
102 
103 #include "zfs_crrd.h"
104 #include "zfs_prop.h"
105 #include "zfs_comutil.h"
106 #include <cityhash.h>
107 
108 /*
109  * spa_thread() existed on Illumos as a parent thread for the various worker
110  * threads that actually run the pool, as a way to both reference the entire
111  * pool work as a single object, and to share properties like scheduling
112  * options. It has not yet been adapted to Linux or FreeBSD. This define is
113  * used to mark related parts of the code to make things easier for the reader,
114  * and to compile this code out. It can be removed when someone implements it,
115  * moves it to some Illumos-specific place, or removes it entirely.
116  */
117 #undef HAVE_SPA_THREAD
118 
119 /*
120  * The "System Duty Cycle" scheduling class is an Illumos feature to help
121  * prevent CPU-intensive kernel threads from affecting latency on interactive
122  * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
123  * gated behind a define. On Illumos SDC depends on spa_thread(), but
124  * spa_thread() also has other uses, so this is a separate define.
125  */
126 #undef HAVE_SYSDC
127 
/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;	/* 5 minutes between retries */
133 
/*
 * Thread-count policy for each per-pool zio taskq; interpreted together
 * with the value/count fields via the ZTI_* macros and zio_taskqs table
 * below.
 */
typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
	ZTI_MODE_SYNC,			/* sync thread assigned */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES			/* count of modes, not a real mode */
} zti_modes_t;
141 
/* Initializers for zio_taskq_info_t: { mode, value, taskq count }. */
#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }	/* n threads x q taskqs */
/*
 * NOTE(review): ZTI_PCT expands to ZTI_MODE_ONLINE_PERCENT, which is not
 * declared in the zti_modes enum above.  The macro appears unused in this
 * file and would fail to compile if expanded — confirm and consider
 * removing it.
 */
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_SCALE(min)	{ ZTI_MODE_SCALE, (min), 1 }	/* CPU-scaled, >= min */
#define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }		/* dedicated sync thread */
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }		/* no taskq at all */

#define	ZTI_N(n)	ZTI_P(n, 1)	/* n threads, single taskq */
#define	ZTI_ONE		ZTI_N(1)	/* one thread, single taskq */
150 
/* Configuration for one taskq: the policy plus its mode-specific values. */
typedef struct zio_taskq_info {
	zti_modes_t zti_mode;	/* thread-count policy (ZTI_MODE_*) */
	uint_t zti_value;	/* meaning depends on zti_mode */
	uint_t zti_count;	/* number of taskqs to create */
} zio_taskq_info_t;

/* Name suffixes for the taskq types: issue/interrupt, normal/high prio. */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"iss", "iss_h", "int", "int_h"
};
160 
161 /*
162  * This table defines the taskq settings for each ZFS I/O type. When
163  * initializing a pool, we use this table to create an appropriately sized
164  * taskq. Some operations are low volume and therefore have a small, static
165  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
166  * macros. Other operations process a large amount of data; the ZTI_SCALE
167  * macro causes us to create a taskq oriented for throughput. Some operations
168  * are so high frequency and short-lived that the taskq itself can become a
169  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
170  * additional degree of parallelism specified by the number of threads per-
171  * taskq and the number of taskqs; when dispatching an event in this case, the
172  * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
173  * that scales with the number of CPUs.
174  *
175  * The different taskq priorities are to handle the different contexts (issue
176  * and interrupt) and then to reserve threads for high priority I/Os that
177  * need to be handled with minimum delay.  Illumos taskq has unfair TQ_FRONT
178  * implementation, so separate high priority threads are used there.
179  */
/*
 * Rows are indexed by zio type (the trailing comments name each row) and
 * must stay in zio_type_t declaration order; columns are the four taskq
 * types named in zio_taskq_types above.
 */
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE(0),	ZTI_NULL }, /* READ */
#ifdef illumos
	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE(0),	ZTI_N(5) }, /* WRITE */
#else
	/* Linux/FreeBSD have no unfair TQ_FRONT, so no high-prio taskqs. */
	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE(0),	ZTI_NULL }, /* WRITE */
#endif
	{ ZTI_SCALE(32), ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
};
194 
195 static void spa_sync_version(void *arg, dmu_tx_t *tx);
196 static void spa_sync_props(void *arg, dmu_tx_t *tx);
197 static boolean_t spa_has_active_shared_spare(spa_t *spa);
198 static int spa_load_impl(spa_t *spa, spa_import_type_t type,
199     const char **ereport);
200 static void spa_vdev_resilver_done(spa_t *spa);
201 
/*
 * Percentage of all CPUs that can be used by the metaslab preload taskq.
 */
static uint_t metaslab_preload_pct = 50;

static uint_t	zio_taskq_batch_pct = 80;	  /* 1 thread per cpu in pset */
static uint_t	zio_taskq_batch_tpq;		  /* threads per taskq */

#ifdef HAVE_SYSDC
static const boolean_t	zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
#endif

#ifdef HAVE_SPA_THREAD
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif

/* Write issue taskqs: threads per taskq (cf. zio_taskq_batch_tpq above). */
static uint_t	zio_taskq_write_tpq = 16;

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t	spa_load_verify_dryrun = B_FALSE;

/*
 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
 * This is used by zdb for spacemaps verification.
 */
boolean_t	spa_mode_readable_spacemaps = B_FALSE;
232 
/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * For debugging purposes: print out vdev tree during pool import.
 */
static int		spa_load_print_vdev_tree = B_FALSE;

/*
 * A non-zero value for zfs_max_missing_tvds means that we allow importing
 * pools with missing top-level vdevs. This is strictly intended for advanced
 * pool recovery cases since missing data is almost inevitable. Pools with
 * missing devices can only be imported read-only for safety reasons, and their
 * fail-mode will be automatically set to "continue".
 *
 * With 1 missing vdev we should be able to import the pool and mount all
 * datasets. User data that was not modified after the missing device has been
 * added should be recoverable. This means that snapshots created prior to the
 * addition of that device should be completely intact.
 *
 * With 2 missing vdevs, some datasets may fail to mount since there are
 * dataset statistics that are stored as regular metadata. Some data might be
 * recoverable if those vdevs were added recently.
 *
 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
 * may be missing entirely. Chances of data recovery are very low. Note that
 * there are also risks of performing an inadvertent rewind as we might be
 * missing all the vdevs with the latest uberblocks.
 */
uint64_t	zfs_max_missing_tvds = 0;

/*
 * The parameters below are similar to zfs_max_missing_tvds but are only
 * intended for a preliminary open of the pool with an untrusted config which
 * might be incomplete or out-dated.
 *
 * We are more tolerant for pools opened from a cachefile since we could have
 * an out-dated cachefile where a device removal was not registered.
 * We could have set the limit arbitrarily high but in the case where devices
 * are really missing we would want to return the proper error codes; we chose
 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
 * and we get a chance to retrieve the trusted config.
 */
uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;

/*
 * In the case where config was assembled by scanning device paths (/dev/dsks
 * by default) we are less tolerant since all the existing devices should have
 * been detected and we want spa_load to return the right error codes.
 */
uint64_t	zfs_max_missing_tvds_scan = 0;

/*
 * Debugging aid that pauses spa_sync() towards the end.
 */
static const boolean_t	zfs_pause_spa_sync = B_FALSE;
292 
/*
 * Variables to indicate the livelist condense zthr func should wait at certain
 * points for the livelist to be removed - used to test condense/destroy races
 * (test-only knobs).
 */
static int zfs_livelist_condense_zthr_pause = 0;
static int zfs_livelist_condense_sync_pause = 0;

/*
 * Variables to track whether or not condense cancellation has been
 * triggered in testing.
 */
static int zfs_livelist_condense_sync_cancel = 0;
static int zfs_livelist_condense_zthr_cancel = 0;

/*
 * Variable to track whether or not extra ALLOC blkptrs were added to a
 * livelist entry while it was being condensed (caused by the way we track
 * remapped blkptrs in dbuf_remap_impl)
 */
static int zfs_livelist_condense_new_alloc = 0;

/*
 * Time variable to decide how often the txg should be added into the
 * database (in seconds).
 * The smallest available resolution is in minutes, which means an update occurs
 * each time we reach `spa_note_txg_time` and the txg has changed. We provide
 * a 256-slot ring buffer for minute-level resolution. The number is limited by
 * the size of the structure we use and the maximum amount of bytes we can write
 * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
 * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
 * high-resolution data.
 *
 * The user can decrease `spa_note_txg_time` to increase resolution within
 * a day, at the cost of retaining fewer days of data. Alternatively, increasing
 * the interval allows storing data over a longer period, but with lower
 * frequency.
 *
 * This parameter does not affect the daily or monthly databases, as those only
 * store one record per day and per month, respectively.
 */
static uint_t spa_note_txg_time = 10 * 60;	/* 10 minutes, in seconds */

/*
 * How often flush txg database to a disk (in seconds).
 * We flush data every time we write to it, making it the most reliable option.
 * Since this happens every 10 minutes, it shouldn't introduce any noticeable
 * overhead for the system. In case of failure, we will always have an
 * up-to-date version of the database.
 *
 * The user can adjust the flush interval to a lower value, but it probably
 * doesn't make sense to flush more often than the database is updated.
 * The user can also increase the interval if they're concerned about the
 * performance of writing the entire database to disk.
 */
static uint_t spa_flush_txg_time = 10 * 60;	/* 10 minutes, in seconds */
348 
349 /*
350  * ==========================================================================
351  * SPA properties routines
352  * ==========================================================================
353  */
354 
355 /*
356  * Add a (source=src, propname=propval) list to an nvlist.
357  */
358 static void
spa_prop_add_list(nvlist_t * nvl,zpool_prop_t prop,const char * strval,uint64_t intval,zprop_source_t src)359 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
360     uint64_t intval, zprop_source_t src)
361 {
362 	const char *propname = zpool_prop_to_name(prop);
363 	nvlist_t *propval;
364 
365 	propval = fnvlist_alloc();
366 	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
367 
368 	if (strval != NULL)
369 		fnvlist_add_string(propval, ZPROP_VALUE, strval);
370 	else
371 		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
372 
373 	fnvlist_add_nvlist(nvl, propname, propval);
374 	nvlist_free(propval);
375 }
376 
377 static int
spa_prop_add(spa_t * spa,const char * propname,nvlist_t * outnvl)378 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl)
379 {
380 	zpool_prop_t prop = zpool_name_to_prop(propname);
381 	zprop_source_t src = ZPROP_SRC_NONE;
382 	uint64_t intval;
383 	int err;
384 
385 	/*
386 	 * NB: Not all properties lookups via this API require
387 	 * the spa props lock, so they must explicitly grab it here.
388 	 */
389 	switch (prop) {
390 	case ZPOOL_PROP_DEDUPCACHED:
391 		err = ddt_get_pool_dedup_cached(spa, &intval);
392 		if (err != 0)
393 			return (SET_ERROR(err));
394 		break;
395 	default:
396 		return (SET_ERROR(EINVAL));
397 	}
398 
399 	spa_prop_add_list(outnvl, prop, NULL, intval, src);
400 
401 	return (0);
402 }
403 
404 int
spa_prop_get_nvlist(spa_t * spa,char ** props,unsigned int n_props,nvlist_t * outnvl)405 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
406     nvlist_t *outnvl)
407 {
408 	int err = 0;
409 
410 	if (props == NULL)
411 		return (0);
412 
413 	for (unsigned int i = 0; i < n_props && err == 0; i++) {
414 		err = spa_prop_add(spa, props[i], outnvl);
415 	}
416 
417 	return (err);
418 }
419 
420 /*
421  * Add metaslab class properties to an nvlist.
422  */
423 static void
spa_prop_add_metaslab_class(nvlist_t * nv,metaslab_class_t * mc,zpool_mc_props_t mcp,uint64_t * sizep,uint64_t * allocp,uint64_t * usablep,uint64_t * usedp)424 spa_prop_add_metaslab_class(nvlist_t *nv, metaslab_class_t *mc,
425     zpool_mc_props_t mcp, uint64_t *sizep, uint64_t *allocp, uint64_t *usablep,
426     uint64_t *usedp)
427 {
428 	uint64_t size = metaslab_class_get_space(mc);
429 	uint64_t alloc = metaslab_class_get_alloc(mc);
430 	uint64_t dsize = metaslab_class_get_dspace(mc);
431 	uint64_t dalloc = metaslab_class_get_dalloc(mc);
432 	uint64_t cap = (size == 0) ? 0 : (alloc * 100 / size);
433 	const zprop_source_t src = ZPROP_SRC_NONE;
434 
435 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_SIZE, NULL, size, src);
436 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_ALLOCATED, NULL, alloc, src);
437 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_USABLE, NULL, dsize, src);
438 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_USED, NULL, dalloc, src);
439 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_FRAGMENTATION, NULL,
440 	    metaslab_class_fragmentation(mc), src);
441 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_EXPANDSZ, NULL,
442 	    metaslab_class_expandable_space(mc), src);
443 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_FREE, NULL, size - alloc,
444 	    src);
445 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_AVAILABLE, NULL,
446 	    dsize - dalloc, src);
447 	spa_prop_add_list(nv, mcp + ZPOOL_MC_PROP_CAPACITY, NULL, cap, src);
448 	if (sizep != NULL)
449 		*sizep += size;
450 	if (allocp != NULL)
451 		*allocp += alloc;
452 	if (usablep != NULL)
453 		*usablep += dsize;
454 	if (usedp != NULL)
455 		*usedp += dalloc;
456 }
457 
458 /*
459  * Add a user property (source=src, propname=propval) to an nvlist.
460  */
461 static void
spa_prop_add_user(nvlist_t * nvl,const char * propname,char * strval,zprop_source_t src)462 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
463     zprop_source_t src)
464 {
465 	nvlist_t *propval;
466 
467 	VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP));
468 	VERIFY0(nvlist_add_uint64(propval, ZPROP_SOURCE, src));
469 	VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, strval));
470 	VERIFY0(nvlist_add_nvlist(nvl, propname, propval));
471 	nvlist_free(propval);
472 }
473 
/*
 * Get property values from the spa configuration.
 *
 * Adds the pool-wide, in-core properties (name, space accounting per
 * metaslab class, dedup/block-clone ratios, health, version, cachefile,
 * etc.) to "nv".  Caller must hold spa_props_lock (asserted below).
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t *nv)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, usable, used, cap, version;
	const zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	/* Vdev-derived properties only exist once the vdev tree is set up. */
	if (rvd != NULL) {
		spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);

		/*
		 * Emit per-class space properties while accumulating the
		 * pool-wide totals.  The SLOG class passes NULL accumulators
		 * and is therefore excluded from the totals.
		 */
		size = alloc = usable = used = 0;
		spa_prop_add_metaslab_class(nv, mc, ZPOOL_MC_PROPS_NORMAL,
		    &size, &alloc, &usable, &used);
		spa_prop_add_metaslab_class(nv, spa_special_class(spa),
		    ZPOOL_MC_PROPS_SPECIAL, &size, &alloc, &usable, &used);
		spa_prop_add_metaslab_class(nv, spa_dedup_class(spa),
		    ZPOOL_MC_PROPS_DEDUP, &size, &alloc, &usable, &used);
		spa_prop_add_metaslab_class(nv, spa_log_class(spa),
		    ZPOOL_MC_PROPS_LOG, NULL, NULL, NULL, NULL);
		spa_prop_add_metaslab_class(nv, spa_embedded_log_class(spa),
		    ZPOOL_MC_PROPS_ELOG, &size, &alloc, &usable, &used);
		spa_prop_add_metaslab_class(nv,
		    spa_special_embedded_log_class(spa), ZPOOL_MC_PROPS_SELOG,
		    &size, &alloc, &usable, &used);

		spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);
		spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		/* Avoid division by zero on a zero-sized pool. */
		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src);
		spa_prop_add_list(nv, ZPOOL_PROP_AVAILABLE, NULL, usable - used,
		    src);
		spa_prop_add_list(nv, ZPOOL_PROP_USABLE, NULL, usable, src);
		spa_prop_add_list(nv, ZPOOL_PROP_USED, NULL, used, src);

		spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL,
		    spa->spa_checkpoint_info.sci_dspace, src);
		spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == SPA_MODE_READ), src);

		/* Deduplication and block-cloning space savings. */
		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);
		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPUSED, NULL,
		    ddt_get_dedup_used(spa), src);
		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPSAVED, NULL,
		    ddt_get_dedup_saved(spa), src);
		spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL,
		    brt_get_used(spa), src);
		spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL,
		    brt_get_saved(spa), src);
		spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL,
		    brt_get_ratio(spa), src);

		spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
		    ddt_get_ddt_dsize(spa), src);
		spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);
		spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL,
		    spa_get_last_scrubbed_txg(spa), src);

		/* Version is "default" only if it matches the default. */
		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
			    version, ZPROP_SRC_DEFAULT);
		} else {
			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
			    version, ZPROP_SRC_LOCAL);
		}
		spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID,
		    NULL, spa_load_guid(spa), src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
		 * when opening pools before this version freedir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(nv, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_compatibility != NULL) {
		spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY,
		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	/* Block/dnode size limits depend on which features are enabled. */
	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
	}

	/*
	 * Report the cachefile only when it differs from the default path;
	 * a NULL dirent path means caching is disabled ("none").
	 */
	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}
625 
/*
 * Get zpool property values.
 *
 * Fills "nv" with the in-core config properties and then with any
 * persistent properties stored in the MOS pool-props ZAP object.
 * Takes the DSL pool config lock and spa_props_lock (in that order).
 * Returns 0 on success, or an error from dataset/ZAP lookup; ENOENT
 * from the cursor simply marks the end of iteration and is not an error.
 */
int
spa_prop_get(spa_t *spa, nvlist_t *nv)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t *za;
	dsl_pool_t *dp;
	int err = 0;

	dp = spa_get_dsl(spa);
	dsl_pool_config_enter(dp, FTAG);
	za = zap_attribute_alloc();
	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nv);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0)
		goto out;

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		/* Skip entries that are neither known nor user properties. */
		if ((prop = zpool_name_to_prop(za->za_name)) ==
		    ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name))
			continue;

		/* Integer length distinguishes uint64 (8) from string (1). */
		switch (za->za_integer_length) {
		case 8:
			/* integer property */
			if (za->za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				/*
				 * bootfs is stored as a dataset object
				 * number; translate it back to a name.
				 */
				dsl_dataset_t *ds = NULL;

				err = dsl_dataset_hold_obj(dp,
				    za->za_first_integer, FTAG, &ds);
				if (err != 0)
					break;

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
			} else {
				strval = NULL;
				intval = za->za_first_integer;
			}

			spa_prop_add_list(nv, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za->za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za->za_name, 1, za->za_num_integers, strval);
			if (err) {
				kmem_free(strval, za->za_num_integers);
				break;
			}
			/* A stored value always means a local source. */
			if (prop != ZPOOL_PROP_INVAL) {
				spa_prop_add_list(nv, prop, strval, 0, src);
			} else {
				src = ZPROP_SRC_LOCAL;
				spa_prop_add_user(nv, za->za_name, strval,
				    src);
			}
			kmem_free(strval, za->za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
out:
	mutex_exit(&spa->spa_props_lock);
	dsl_pool_config_exit(dp, FTAG);
	zap_attribute_free(za);

	/* ENOENT just means the ZAP cursor reached the end. */
	if (err && err != ENOENT)
		return (err);

	return (0);
}
732 
733 /*
734  * Validate the given pool properties nvlist and modify the list
735  * for the property values to be set.
736  */
737 static int
spa_prop_validate(spa_t * spa,nvlist_t * props)738 spa_prop_validate(spa_t *spa, nvlist_t *props)
739 {
740 	nvpair_t *elem;
741 	int error = 0, reset_bootfs = 0;
742 	uint64_t objnum = 0;
743 	boolean_t has_feature = B_FALSE;
744 
745 	elem = NULL;
746 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
747 		uint64_t intval;
748 		const char *strval, *slash, *check, *fname;
749 		const char *propname = nvpair_name(elem);
750 		zpool_prop_t prop = zpool_name_to_prop(propname);
751 
752 		switch (prop) {
753 		case ZPOOL_PROP_INVAL:
754 			/*
755 			 * Sanitize the input.
756 			 */
757 			if (zfs_prop_user(propname)) {
758 				if (strlen(propname) >= ZAP_MAXNAMELEN) {
759 					error = SET_ERROR(ENAMETOOLONG);
760 					break;
761 				}
762 
763 				if (strlen(fnvpair_value_string(elem)) >=
764 				    ZAP_MAXVALUELEN) {
765 					error = SET_ERROR(E2BIG);
766 					break;
767 				}
768 			} else if (zpool_prop_feature(propname)) {
769 				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
770 					error = SET_ERROR(EINVAL);
771 					break;
772 				}
773 
774 				if (nvpair_value_uint64(elem, &intval) != 0) {
775 					error = SET_ERROR(EINVAL);
776 					break;
777 				}
778 
779 				if (intval != 0) {
780 					error = SET_ERROR(EINVAL);
781 					break;
782 				}
783 
784 				fname = strchr(propname, '@') + 1;
785 				if (zfeature_lookup_name(fname, NULL) != 0) {
786 					error = SET_ERROR(EINVAL);
787 					break;
788 				}
789 
790 				has_feature = B_TRUE;
791 			} else {
792 				error = SET_ERROR(EINVAL);
793 				break;
794 			}
795 			break;
796 
797 		case ZPOOL_PROP_VERSION:
798 			error = nvpair_value_uint64(elem, &intval);
799 			if (!error &&
800 			    (intval < spa_version(spa) ||
801 			    intval > SPA_VERSION_BEFORE_FEATURES ||
802 			    has_feature))
803 				error = SET_ERROR(EINVAL);
804 			break;
805 
806 		case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
807 			error = nvpair_value_uint64(elem, &intval);
808 			break;
809 
810 		case ZPOOL_PROP_DELEGATION:
811 		case ZPOOL_PROP_AUTOREPLACE:
812 		case ZPOOL_PROP_LISTSNAPS:
813 		case ZPOOL_PROP_AUTOEXPAND:
814 		case ZPOOL_PROP_AUTOTRIM:
815 			error = nvpair_value_uint64(elem, &intval);
816 			if (!error && intval > 1)
817 				error = SET_ERROR(EINVAL);
818 			break;
819 
820 		case ZPOOL_PROP_MULTIHOST:
821 			error = nvpair_value_uint64(elem, &intval);
822 			if (!error && intval > 1)
823 				error = SET_ERROR(EINVAL);
824 
825 			if (!error) {
826 				uint32_t hostid = zone_get_hostid(NULL);
827 				if (hostid)
828 					spa->spa_hostid = hostid;
829 				else
830 					error = SET_ERROR(ENOTSUP);
831 			}
832 
833 			break;
834 
835 		case ZPOOL_PROP_BOOTFS:
836 			/*
837 			 * If the pool version is less than SPA_VERSION_BOOTFS,
838 			 * or the pool is still being created (version == 0),
839 			 * the bootfs property cannot be set.
840 			 */
841 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
842 				error = SET_ERROR(ENOTSUP);
843 				break;
844 			}
845 
846 			/*
847 			 * Make sure the vdev config is bootable
848 			 */
849 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
850 				error = SET_ERROR(ENOTSUP);
851 				break;
852 			}
853 
854 			reset_bootfs = 1;
855 
856 			error = nvpair_value_string(elem, &strval);
857 
858 			if (!error) {
859 				objset_t *os;
860 
861 				if (strval == NULL || strval[0] == '\0') {
862 					objnum = zpool_prop_default_numeric(
863 					    ZPOOL_PROP_BOOTFS);
864 					break;
865 				}
866 
867 				error = dmu_objset_hold(strval, FTAG, &os);
868 				if (error != 0)
869 					break;
870 
871 				/* Must be ZPL. */
872 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
873 					error = SET_ERROR(ENOTSUP);
874 				} else {
875 					objnum = dmu_objset_id(os);
876 				}
877 				dmu_objset_rele(os, FTAG);
878 			}
879 			break;
880 
881 		case ZPOOL_PROP_FAILUREMODE:
882 			error = nvpair_value_uint64(elem, &intval);
883 			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
884 				error = SET_ERROR(EINVAL);
885 
886 			/*
887 			 * This is a special case which only occurs when
888 			 * the pool has completely failed. This allows
889 			 * the user to change the in-core failmode property
890 			 * without syncing it out to disk (I/Os might
891 			 * currently be blocked). We do this by returning
892 			 * EIO to the caller (spa_prop_set) to trick it
893 			 * into thinking we encountered a property validation
894 			 * error.
895 			 */
896 			if (!error && spa_suspended(spa)) {
897 				spa->spa_failmode = intval;
898 				error = SET_ERROR(EIO);
899 			}
900 			break;
901 
902 		case ZPOOL_PROP_CACHEFILE:
903 			if ((error = nvpair_value_string(elem, &strval)) != 0)
904 				break;
905 
906 			if (strval[0] == '\0')
907 				break;
908 
909 			if (strcmp(strval, "none") == 0)
910 				break;
911 
912 			if (strval[0] != '/') {
913 				error = SET_ERROR(EINVAL);
914 				break;
915 			}
916 
917 			slash = strrchr(strval, '/');
918 			ASSERT(slash != NULL);
919 
920 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
921 			    strcmp(slash, "/..") == 0)
922 				error = SET_ERROR(EINVAL);
923 			break;
924 
925 		case ZPOOL_PROP_COMMENT:
926 			if ((error = nvpair_value_string(elem, &strval)) != 0)
927 				break;
928 			for (check = strval; *check != '\0'; check++) {
929 				if (!isprint(*check)) {
930 					error = SET_ERROR(EINVAL);
931 					break;
932 				}
933 			}
934 			if (strlen(strval) > ZPROP_MAX_COMMENT)
935 				error = SET_ERROR(E2BIG);
936 			break;
937 
938 		default:
939 			break;
940 		}
941 
942 		if (error)
943 			break;
944 	}
945 
946 	(void) nvlist_remove_all(props,
947 	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
948 
949 	if (!error && reset_bootfs) {
950 		error = nvlist_remove(props,
951 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
952 
953 		if (!error) {
954 			error = nvlist_add_uint64(props,
955 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
956 		}
957 	}
958 
959 	return (error);
960 }
961 
962 void
spa_configfile_set(spa_t * spa,nvlist_t * nvp,boolean_t need_sync)963 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
964 {
965 	const char *cachefile;
966 	spa_config_dirent_t *dp;
967 
968 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
969 	    &cachefile) != 0)
970 		return;
971 
972 	dp = kmem_alloc(sizeof (spa_config_dirent_t),
973 	    KM_SLEEP);
974 
975 	if (cachefile[0] == '\0')
976 		dp->scd_path = spa_strdup(spa_config_path);
977 	else if (strcmp(cachefile, "none") == 0)
978 		dp->scd_path = NULL;
979 	else
980 		dp->scd_path = spa_strdup(cachefile);
981 
982 	list_insert_head(&spa->spa_config_list, dp);
983 	if (need_sync)
984 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
985 }
986 
/*
 * Set the given pool properties.  The properties are validated first;
 * then each is either consumed in-core, applied via a version-upgrade
 * sync task, or deferred to a single spa_sync_props sync task at the
 * end.  Returns 0 or an errno-style error from validation or a sync
 * task.
 */
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		/*
		 * These properties are handled in-core (e.g. cachefile
		 * via spa_configfile_set()) and never need a sync task.
		 */
		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		/* User properties are always written out to disk. */
		if (prop == ZPOOL_PROP_INVAL &&
		    zfs_prop_user(nvpair_name(elem))) {
			need_sync = B_TRUE;
			break;
		}

		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
			uint64_t ver = 0;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY0(nvpair_value_uint64(elem, &ver));
			} else {
				/*
				 * A feature@ property: enabling features
				 * implies upgrading to SPA_VERSION_FEATURES,
				 * and the feature itself still needs to be
				 * synced out.
				 */
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		/* Any other property must go through spa_sync_props. */
		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}
1051 
1052 /*
1053  * If the bootfs property value is dsobj, clear it.
1054  */
1055 void
spa_prop_clear_bootfs(spa_t * spa,uint64_t dsobj,dmu_tx_t * tx)1056 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
1057 {
1058 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
1059 		VERIFY(zap_remove(spa->spa_meta_objset,
1060 		    spa->spa_pool_props_object,
1061 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
1062 		spa->spa_bootfs = 0;
1063 	}
1064 }
1065 
1066 static int
spa_change_guid_check(void * arg,dmu_tx_t * tx)1067 spa_change_guid_check(void *arg, dmu_tx_t *tx)
1068 {
1069 	uint64_t *newguid __maybe_unused = arg;
1070 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1071 	vdev_t *rvd = spa->spa_root_vdev;
1072 	uint64_t vdev_state;
1073 
1074 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
1075 		int error = (spa_has_checkpoint(spa)) ?
1076 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
1077 		return (SET_ERROR(error));
1078 	}
1079 
1080 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1081 	vdev_state = rvd->vdev_state;
1082 	spa_config_exit(spa, SCL_STATE, FTAG);
1083 
1084 	if (vdev_state != VDEV_STATE_HEALTHY)
1085 		return (SET_ERROR(ENXIO));
1086 
1087 	ASSERT3U(spa_guid(spa), !=, *newguid);
1088 
1089 	return (0);
1090 }
1091 
1092 static void
spa_change_guid_sync(void * arg,dmu_tx_t * tx)1093 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
1094 {
1095 	uint64_t *newguid = arg;
1096 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1097 	uint64_t oldguid;
1098 	vdev_t *rvd = spa->spa_root_vdev;
1099 
1100 	oldguid = spa_guid(spa);
1101 
1102 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1103 	rvd->vdev_guid = *newguid;
1104 	rvd->vdev_guid_sum += (*newguid - oldguid);
1105 	vdev_config_dirty(rvd);
1106 	spa_config_exit(spa, SCL_STATE, FTAG);
1107 
1108 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
1109 	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
1110 }
1111 
1112 /*
1113  * Change the GUID for the pool.  This is done so that we can later
1114  * re-import a pool built from a clone of our own vdevs.  We will modify
1115  * the root vdev's guid, our own pool guid, and then mark all of our
1116  * vdevs dirty.  Note that we must make sure that all our vdevs are
1117  * online when we do this, or else any vdevs that weren't present
1118  * would be orphaned from our pool.  We are also going to issue a
1119  * sysevent to update any watchers.
1120  *
1121  * The GUID of the pool will be changed to the value pointed to by guidp.
 * The GUID may not be set to the reserved value of 0.
1123  * The new GUID will be generated if guidp is NULL.
1124  */
int
spa_change_guid(spa_t *spa, const uint64_t *guidp)
{
	uint64_t guid;
	int error;

	/*
	 * Hold spa_vdev_top_lock and the namespace lock for the duration
	 * so the GUID change cannot race with top-level vdev operations
	 * or pool namespace lookups.
	 */
	mutex_enter(&spa->spa_vdev_top_lock);
	spa_namespace_enter(FTAG);

	if (guidp != NULL) {
		guid = *guidp;
		/* 0 is reserved and may not be used as a pool GUID. */
		if (guid == 0) {
			error = SET_ERROR(EINVAL);
			goto out;
		}

		/* The requested GUID must not collide with another pool. */
		if (spa_guid_exists(guid, 0)) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
	} else {
		/* No specific GUID requested; generate a fresh one. */
		guid = spa_generate_guid(NULL);
	}

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		/*
		 * Clear the kobj flag from all the vdevs to allow
		 * vdev_cache_process_kobj_evt() to post events to all the
		 * vdevs since GUID is updated.
		 */
		vdev_clear_kobj_evt(spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);

		/* Persist the new GUID and notify userland watchers. */
		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
	}

out:
	spa_namespace_exit(FTAG);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}
1172 
1173 /*
1174  * ==========================================================================
1175  * SPA state manipulation (open/create/destroy/import/export)
1176  * ==========================================================================
1177  */
1178 
1179 static int
spa_error_entry_compare(const void * a,const void * b)1180 spa_error_entry_compare(const void *a, const void *b)
1181 {
1182 	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
1183 	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
1184 	int ret;
1185 
1186 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
1187 	    sizeof (zbookmark_phys_t));
1188 
1189 	return (TREE_ISIGN(ret));
1190 }
1191 
1192 /*
1193  * Utility function which retrieves copies of the current logs and
1194  * re-initializes them in the process.
1195  */
1196 void
spa_get_errlists(spa_t * spa,avl_tree_t * last,avl_tree_t * scrub)1197 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
1198 {
1199 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
1200 
1201 	memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
1202 	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));
1203 
1204 	avl_create(&spa->spa_errlist_scrub,
1205 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1206 	    offsetof(spa_error_entry_t, se_avl));
1207 	avl_create(&spa->spa_errlist_last,
1208 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1209 	    offsetof(spa_error_entry_t, se_avl));
1210 }
1211 
/*
 * Create the taskq(s) backing one (zio type, taskq type) cell of
 * spa->spa_zio_taskq, sizing the taskq count and threads-per-taskq from
 * the mode configured in zio_taskqs.  ZTI_MODE_NULL leaves the cell
 * empty (dispatch falls back to the regular queue for that type).
 */
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	uint_t cpus, threads, flags = TASKQ_DYNAMIC;

	switch (mode) {
	case ZTI_MODE_FIXED:
		/* count taskqs of value threads each, straight from config. */
		ASSERT3U(value, >, 0);
		break;

	case ZTI_MODE_SYNC:

		/*
		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
		 * not to exceed the number of spa allocators, and align to it.
		 */
		threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		count = MAX(1, threads / MAX(1, zio_taskq_write_tpq));
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		count = MIN(count, spa->spa_alloc_count);
		/* Prefer a count that divides the allocator count evenly. */
		while (spa->spa_alloc_count % count != 0 &&
		    spa->spa_alloc_count < count * 2)
			count--;

		/*
		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
		 * single taskq may have more threads than 100% of online cpus.
		 */
		value = (zio_taskq_batch_pct + count / 2) / count;
		value = MIN(value, 100);
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	case ZTI_MODE_SCALE:
		/*
		 * We want more taskqs to reduce lock contention, but we want
		 * less for better request ordering and CPU utilization.
		 */
		threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		threads = MAX(threads, value);
		if (zio_taskq_batch_tpq > 0) {
			count = MAX(1, (threads + zio_taskq_batch_tpq / 2) /
			    zio_taskq_batch_tpq);
		} else {
			/*
			 * Prefer 6 threads per taskq, but no more taskqs
			 * than threads in them on large systems. For 80%:
			 *
			 *                 taskq   taskq   total
			 * cpus    taskqs  percent threads threads
			 * ------- ------- ------- ------- -------
			 * 1       1       80%     1       1
			 * 2       1       80%     1       1
			 * 4       1       80%     3       3
			 * 8       2       40%     3       6
			 * 16      3       27%     4       12
			 * 32      5       16%     5       25
			 * 64      7       11%     7       49
			 * 128     10      8%      10      100
			 * 256     14      6%      15      210
			 */
			cpus = MIN(threads, boot_ncpus);
			count = 1 + threads / 6;
			while (count * count > cpus)
				count--;
		}

		/*
		 * Try to represent the number of threads per taskq as percent
		 * of online CPUs to allow scaling with later online/offline.
		 * Fall back to absolute numbers if can't.
		 */
		value = (threads * 100 + boot_ncpus * count / 2) /
		    (boot_ncpus * count);
		if (value < 5 || value > 100)
			value = MAX(1, (threads + count / 2) / count);
		else
			flags |= TASKQ_THREADS_CPU_PCT;
		break;

	case ZTI_MODE_NULL:
		/* No taskq for this cell at all. */
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_taskqs_init()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	ASSERT3U(count, >, 0);
	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;
		char name[32];

		/* Only suffix the index when there are multiple taskqs. */
		if (count > 1)
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		else
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);

#ifdef HAVE_SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			(void) zio_taskq_basedc;
			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly less important
			 * priority than the other taskqs.
			 */
			const pri_t pri = (t == ZIO_TYPE_WRITE &&
			    q == ZIO_TASKQ_ISSUE) ?
			    wtqclsyspri : maxclsyspri;
			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef HAVE_SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}
1348 
1349 static void
spa_taskqs_fini(spa_t * spa,zio_type_t t,zio_taskq_type_t q)1350 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1351 {
1352 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1353 
1354 	if (tqs->stqs_taskq == NULL) {
1355 		ASSERT0(tqs->stqs_count);
1356 		return;
1357 	}
1358 
1359 	for (uint_t i = 0; i < tqs->stqs_count; i++) {
1360 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1361 		taskq_destroy(tqs->stqs_taskq[i]);
1362 	}
1363 
1364 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1365 	tqs->stqs_taskq = NULL;
1366 }
1367 
1368 #ifdef _KERNEL
1369 /*
1370  * The READ and WRITE rows of zio_taskqs are configurable at module load time
1371  * by setting zio_taskq_read or zio_taskq_write.
1372  *
1373  * Example (the defaults for READ and WRITE)
1374  *   zio_taskq_read='fixed,1,8 null scale null'
1375  *   zio_taskq_write='sync null scale null'
1376  *
1377  * Each sets the entire row at a time.
1378  *
1379  * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
1380  * of threads per taskq.
1381  *
 * 'null' can only be set on the high-priority queues (queue selection for
 * high-priority queues will fall back to the regular queue if the high-pri
 * is NULL).
1385  */
/* Mode names, indexed by enum zti_modes, as used by the row parser below. */
static const char *const modes[ZTI_NMODES] = {
	"fixed", "scale", "sync", "null"
};
1389 
/*
 * Parse an incoming taskq config string and, if it describes a complete
 * valid row, install it as the zio_taskqs row for I/O type t.  The
 * string is one whitespace-separated mode entry per taskq type (see the
 * format comment above modes[]).  Modifies cfg in place.  Returns 0 on
 * success or EINVAL if the string is incomplete, malformed, or has
 * trailing garbage.
 */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
	int err = 0;

	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};

	char *next = cfg, *tok, *c;

	/*
	 * Parse out each element from the string and fill `row`. The entire
	 * row has to be set at once, so any errors are flagged by just
	 * breaking out of this loop early.
	 */
	uint_t q;
	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
		/* `next` is the start of the config */
		if (next == NULL)
			break;

		/* Eat up leading space */
		while (isspace(*next))
			next++;
		if (*next == '\0')
			break;

		/* Mode ends at space or end of string */
		tok = next;
		next = strchr(tok, ' ');
		if (next != NULL) *next++ = '\0';

		/* Parameters start after a comma */
		c = strchr(tok, ',');
		if (c != NULL) *c++ = '\0';

		/* Match mode string */
		uint_t mode;
		for (mode = 0; mode < ZTI_NMODES; mode++)
			if (strcmp(tok, modes[mode]) == 0)
				break;
		if (mode == ZTI_NMODES)
			break;

		/* Invalid canary */
		row[q].zti_mode = ZTI_NMODES;

		/* Per-mode setup */
		switch (mode) {

		/*
		 * FIXED is parameterised: number of queues, and number of
		 * threads per queue.
		 */
		case ZTI_MODE_FIXED: {
			/* No parameters? */
			if (c == NULL || *c == '\0')
				break;

			/* Find next parameter */
			tok = c;
			c = strchr(tok, ',');
			if (c == NULL)
				break;

			/* Take digits and convert */
			unsigned long long nq;
			if (!(isdigit(*tok)))
				break;
			err = ddi_strtoull(tok, &tok, 10, &nq);
			/* Must succeed and also end at the next param sep */
			if (err != 0 || tok != c)
				break;

			/* Move past the comma */
			tok++;
			/* Need another number */
			if (!(isdigit(*tok)))
				break;
			/* Remember start to make sure we moved */
			c = tok;

			/* Take digits */
			unsigned long long ntpq;
			err = ddi_strtoull(tok, &tok, 10, &ntpq);
			/* Must succeed, and moved forward */
			if (err != 0 || tok == c || *tok != '\0')
				break;

			/*
			 * sanity; zero queues/threads make no sense, and
			 * 16K is almost certainly more than anyone will ever
			 * need and avoids silly numbers like UINT32_MAX
			 */
			if (nq == 0 || nq >= 16384 ||
			    ntpq == 0 || ntpq >= 16384)
				break;

			const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
			row[q] = zti;
			break;
		}

		/*
		 * SCALE is optionally parameterised by minimum number of
		 * threads.
		 */
		case ZTI_MODE_SCALE: {
			unsigned long long mint = 0;
			if (c != NULL && *c != '\0') {
				/* Need a number */
				if (!(isdigit(*c)))
					break;
				tok = c;

				/* Take digits */
				err = ddi_strtoull(tok, &tok, 10, &mint);
				/* Must succeed, and moved forward */
				if (err != 0 || tok == c || *tok != '\0')
					break;

				/* Sanity check */
				if (mint >= 16384)
					break;
			}

			const zio_taskq_info_t zti = ZTI_SCALE(mint);
			row[q] = zti;
			break;
		}

		case ZTI_MODE_SYNC: {
			/* SYNC takes no parameters. */
			const zio_taskq_info_t zti = ZTI_SYNC;
			row[q] = zti;
			break;
		}

		case ZTI_MODE_NULL: {
			/*
			 * Can only null the high-priority queues; the general-
			 * purpose ones have to exist.
			 */
			if (q != ZIO_TASKQ_ISSUE_HIGH &&
			    q != ZIO_TASKQ_INTERRUPT_HIGH)
				break;

			const zio_taskq_info_t zti = ZTI_NULL;
			row[q] = zti;
			break;
		}

		default:
			break;
		}

		/* Ensure we set a mode */
		if (row[q].zti_mode == ZTI_NMODES)
			break;
	}

	/* Didn't get a full row, fail */
	if (q < ZIO_TASKQ_TYPES)
		return (SET_ERROR(EINVAL));

	/* Eat trailing space */
	if (next != NULL)
		while (isspace(*next))
			next++;

	/* If there's anything left over then fail */
	if (next != NULL && *next != '\0')
		return (SET_ERROR(EINVAL));

	/* Success! Copy it into the real config */
	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
		zio_taskqs[t][q] = row[q];

	return (0);
}
1569 
/*
 * Render the current zio_taskqs row for I/O type t into buf, in the
 * same format spa_taskq_param_set() parses.  Returns the number of
 * characters written, excluding the terminating NUL.
 *
 * NOTE(review): callers supply fixed-size buffers (e.g. 128 bytes on
 * FreeBSD); the unbounded sprintf() here assumes a formatted row always
 * fits — confirm against the worst-case row length.
 */
static int
spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
{
	int pos = 0;

	/* Build parameter string from live config */
	const char *sep = "";
	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
		if (zti->zti_mode == ZTI_MODE_FIXED)
			pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
			    modes[zti->zti_mode], zti->zti_count,
			    zti->zti_value);
		else if (zti->zti_mode == ZTI_MODE_SCALE && zti->zti_value > 0)
			pos += sprintf(&buf[pos], "%s%s,%u", sep,
			    modes[zti->zti_mode], zti->zti_value);
		else
			pos += sprintf(&buf[pos], "%s%s", sep,
			    modes[zti->zti_mode]);
		sep = " ";
	}

	if (add_newline)
		buf[pos++] = '\n';
	buf[pos] = '\0';

	return (pos);
}
1598 
1599 #ifdef __linux__
1600 static int
spa_taskq_read_param_set(const char * val,zfs_kernel_param_t * kp)1601 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
1602 {
1603 	char *cfg = kmem_strdup(val);
1604 	int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
1605 	kmem_strfree(cfg);
1606 	return (-err);
1607 }
1608 
1609 static int
spa_taskq_read_param_get(char * buf,zfs_kernel_param_t * kp)1610 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
1611 {
1612 	return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
1613 }
1614 
1615 static int
spa_taskq_write_param_set(const char * val,zfs_kernel_param_t * kp)1616 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
1617 {
1618 	char *cfg = kmem_strdup(val);
1619 	int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
1620 	kmem_strfree(cfg);
1621 	return (-err);
1622 }
1623 
1624 static int
spa_taskq_write_param_get(char * buf,zfs_kernel_param_t * kp)1625 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
1626 {
1627 	return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
1628 }
1629 
1630 static int
spa_taskq_free_param_set(const char * val,zfs_kernel_param_t * kp)1631 spa_taskq_free_param_set(const char *val, zfs_kernel_param_t *kp)
1632 {
1633 	char *cfg = kmem_strdup(val);
1634 	int err = spa_taskq_param_set(ZIO_TYPE_FREE, cfg);
1635 	kmem_strfree(cfg);
1636 	return (-err);
1637 }
1638 
1639 static int
spa_taskq_free_param_get(char * buf,zfs_kernel_param_t * kp)1640 spa_taskq_free_param_get(char *buf, zfs_kernel_param_t *kp)
1641 {
1642 	return (spa_taskq_param_get(ZIO_TYPE_FREE, buf, TRUE));
1643 }
1644 #else
1645 /*
1646  * On FreeBSD load-time parameters can be set up before malloc() is available,
1647  * so we have to do all the parsing work on the stack.
1648  */
1649 #define	SPA_TASKQ_PARAM_MAX	(128)
1650 
1651 static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)1652 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
1653 {
1654 	char buf[SPA_TASKQ_PARAM_MAX];
1655 	int err;
1656 
1657 	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
1658 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1659 	if (err || req->newptr == NULL)
1660 		return (err);
1661 	return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
1662 }
1663 
1664 static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)1665 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
1666 {
1667 	char buf[SPA_TASKQ_PARAM_MAX];
1668 	int err;
1669 
1670 	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
1671 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1672 	if (err || req->newptr == NULL)
1673 		return (err);
1674 	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
1675 }
1676 
1677 static int
spa_taskq_free_param(ZFS_MODULE_PARAM_ARGS)1678 spa_taskq_free_param(ZFS_MODULE_PARAM_ARGS)
1679 {
1680 	char buf[SPA_TASKQ_PARAM_MAX];
1681 	int err;
1682 
1683 	(void) spa_taskq_param_get(ZIO_TYPE_FREE, buf, FALSE);
1684 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1685 	if (err || req->newptr == NULL)
1686 		return (err);
1687 	return (spa_taskq_param_set(ZIO_TYPE_FREE, buf));
1688 }
1689 #endif
1690 #endif /* _KERNEL */
1691 
1692 /*
1693  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1694  * Note that a type may have multiple discrete taskqs to avoid lock contention
1695  * on the taskq itself.
1696  */
1697 void
spa_taskq_dispatch(spa_t * spa,zio_type_t t,zio_taskq_type_t q,task_func_t * func,zio_t * zio,boolean_t cutinline)1698 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1699     task_func_t *func, zio_t *zio, boolean_t cutinline)
1700 {
1701 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1702 	taskq_t *tq;
1703 
1704 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
1705 	ASSERT3U(tqs->stqs_count, !=, 0);
1706 
1707 	/*
1708 	 * NB: We are assuming that the zio can only be dispatched
1709 	 * to a single taskq at a time.  It would be a grievous error
1710 	 * to dispatch the zio to another taskq at the same time.
1711 	 */
1712 	ASSERT(zio);
1713 	ASSERT(taskq_empty_ent(&zio->io_tqent));
1714 
1715 	if (tqs->stqs_count == 1) {
1716 		tq = tqs->stqs_taskq[0];
1717 	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
1718 	    ZIO_HAS_ALLOCATOR(zio)) {
1719 		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
1720 	} else {
1721 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
1722 	}
1723 
1724 	taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0,
1725 	    &zio->io_tqent);
1726 }
1727 
1728 static void
spa_create_zio_taskqs(spa_t * spa)1729 spa_create_zio_taskqs(spa_t *spa)
1730 {
1731 	for (int t = 0; t < ZIO_TYPES; t++) {
1732 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1733 			spa_taskqs_init(spa, t, q);
1734 		}
1735 	}
1736 }
1737 
1738 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
/*
 * Body of the dedicated "zpool-<name>" process (illumos only, see
 * HAVE_SPA_THREAD): name the process, optionally bind it to a psrset
 * and enter SDC scheduling, create the pool's zio taskqs, then park
 * until spa_proc_state leaves SPA_PROC_ACTIVE (presumably set during
 * pool deactivation), at which point it tears down and exits.
 */
static void
spa_thread(void *arg)
{
	psetid_t zio_taskq_psrset_bind = PS_NONE;
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	/* Name the process after the pool for ps(1) visibility. */
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

#ifdef HAVE_SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	/* Publish this process so taskqs are created under it. */
	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* Signal spa_activate(), which waits for SPA_PROC_ACTIVE. */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	/* Park (CPR-safe) until deactivation is requested. */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
1809 #endif
1810 
1811 extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
1812 
/*
 * Activate an uninitialized pool: create its metaslab classes, the
 * covering process (where supported), the zio taskqs, per-txg root
 * zios, dirty lists, error trees, keystore, and the pool's helper
 * taskqs.  'mode' records how the pool was opened (spa_mode_t).
 */
static void
spa_activate(spa_t *spa, spa_mode_t mode)
{
	metaslab_ops_t *msp = metaslab_allocator(spa);
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_final_txg = UINT64_MAX;
	spa->spa_mode = mode;
	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;

	/* One metaslab class per allocation tier. */
	spa->spa_normal_class = metaslab_class_create(spa, "normal",
	    msp, B_FALSE);
	spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE);
	spa->spa_embedded_log_class = metaslab_class_create(spa,
	    "embedded_log", msp, B_TRUE);
	spa->spa_special_class = metaslab_class_create(spa, "special",
	    msp, B_FALSE);
	spa->spa_special_embedded_log_class = metaslab_class_create(spa,
	    "special_embedded_log", msp, B_TRUE);
	spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
	    msp, B_FALSE);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef HAVE_SPA_THREAD
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			/* Wait for spa_thread() to report itself active. */
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif /* HAVE_SPA_THREAD */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/* Root zio for each txg, used to suspend/resume I/O per txg. */
	for (size_t i = 0; i < TXG_SIZE; i++) {
		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list, spa,
	    offsetof(struct vdev, vdev_txg_node));

	/* Error trees, keyed by bookmark (see spa_error_entry_compare). */
	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_healed,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));

	spa_activate_os(spa);

	spa_keystore_init(&spa->spa_keystore);

	/*
	 * This taskq is used to perform zvol-minor-related tasks
	 * asynchronously. This has several advantages, including easy
	 * resolution of various deadlocks.
	 *
	 * The taskq must be single threaded to ensure tasks are always
	 * processed in the order in which they were dispatched.
	 *
	 * A taskq per pool allows one to keep the pools independent.
	 * This way if one pool is suspended, it will not impact another.
	 *
	 * The preferred location to dispatch a zvol minor task is a sync
	 * task. In this context, there is easy access to the spa_t and minimal
	 * error handling is required because the sync task must succeed.
	 */
	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
	    1, INT_MAX, 0);

	/*
	 * The taskq to preload metaslabs.
	 */
	spa->spa_metaslab_taskq = taskq_create("z_metaslab",
	    metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
	    TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	/*
	 * Taskq dedicated to prefetcher threads: this is used to prevent the
	 * pool traverse code from monopolizing the global (and limited)
	 * system_taskq by inappropriately scheduling long running tasks on it.
	 */
	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	/*
	 * The taskq to upgrade datasets in this pool. Currently used by
	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
	 */
	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
}
1943 
1944 /*
1945  * Opposite of spa_activate().
1946  */
static void
spa_deactivate(spa_t *spa)
{
	/*
	 * Tear down all state set up by spa_activate().  By the time we
	 * get here the caller (spa_unload()) must have stopped syncing
	 * and released the DSL pool, vdev tree, and async root zios.
	 */
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT0P(spa->spa_dsl_pool);
	ASSERT0P(spa->spa_root_vdev);
	ASSERT0P(spa->spa_async_zio_root);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/* Wait for objsets still being evicted before destroying taskqs. */
	spa_evicting_os_wait(spa);

	/* Destroy the per-pool taskqs created in spa_activate(). */
	if (spa->spa_zvol_taskq) {
		taskq_destroy(spa->spa_zvol_taskq);
		spa->spa_zvol_taskq = NULL;
	}

	if (spa->spa_metaslab_taskq) {
		taskq_destroy(spa->spa_metaslab_taskq);
		spa->spa_metaslab_taskq = NULL;
	}

	if (spa->spa_prefetch_taskq) {
		taskq_destroy(spa->spa_prefetch_taskq);
		spa->spa_prefetch_taskq = NULL;
	}

	if (spa->spa_upgrade_taskq) {
		taskq_destroy(spa->spa_upgrade_taskq);
		spa->spa_upgrade_taskq = NULL;
	}

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	/* Cancel any still-pending deadman callback and wait for it. */
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);

	/* Tear down the per-(type, queue) zio taskqs. */
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	/* Wait for and release the per-txg root zios. */
	for (size_t i = 0; i < TXG_SIZE; i++) {
		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
		spa->spa_txg_zio[i] = NULL;
	}

	/* Destroy all metaslab allocation classes. */
	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	metaslab_class_destroy(spa->spa_embedded_log_class);
	spa->spa_embedded_log_class = NULL;

	metaslab_class_destroy(spa->spa_special_class);
	spa->spa_special_class = NULL;

	metaslab_class_destroy(spa->spa_special_embedded_log_class);
	spa->spa_special_embedded_log_class = NULL;

	metaslab_class_destroy(spa->spa_dedup_class);
	spa->spa_dedup_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);
	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);
	avl_destroy(&spa->spa_errlist_healed);

	spa_keystore_fini(&spa->spa_keystore);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * If a dedicated pool process was created (HAVE_SPA_THREAD), ask
	 * it to exit and wait for the handshake under spa_proc_lock.
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}

	spa_deactivate_os(spa);

}
2057 
2058 /*
2059  * Verify a pool configuration, and construct the vdev tree appropriately.  This
2060  * will create all the necessary vdevs in the appropriate layout, with each vdev
2061  * in the CLOSED state.  This will prep the pool before open/creation/import.
2062  * All vdev validation is done by the vdev_alloc() routine.
2063  */
2064 int
spa_config_parse(spa_t * spa,vdev_t ** vdp,nvlist_t * nv,vdev_t * parent,uint_t id,int atype)2065 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
2066     uint_t id, int atype)
2067 {
2068 	nvlist_t **child;
2069 	uint_t children;
2070 	int error;
2071 
2072 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
2073 		return (error);
2074 
2075 	if ((*vdp)->vdev_ops->vdev_op_leaf)
2076 		return (0);
2077 
2078 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2079 	    &child, &children);
2080 
2081 	if (error == ENOENT)
2082 		return (0);
2083 
2084 	if (error) {
2085 		vdev_free(*vdp);
2086 		*vdp = NULL;
2087 		return (SET_ERROR(EINVAL));
2088 	}
2089 
2090 	for (int c = 0; c < children; c++) {
2091 		vdev_t *vd;
2092 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
2093 		    atype)) != 0) {
2094 			vdev_free(*vdp);
2095 			*vdp = NULL;
2096 			return (error);
2097 		}
2098 	}
2099 
2100 	ASSERT(*vdp != NULL);
2101 
2102 	return (0);
2103 }
2104 
2105 static boolean_t
spa_should_flush_logs_on_unload(spa_t * spa)2106 spa_should_flush_logs_on_unload(spa_t *spa)
2107 {
2108 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
2109 		return (B_FALSE);
2110 
2111 	if (!spa_writeable(spa))
2112 		return (B_FALSE);
2113 
2114 	if (!spa->spa_sync_on)
2115 		return (B_FALSE);
2116 
2117 	if (spa_state(spa) != POOL_STATE_EXPORTED)
2118 		return (B_FALSE);
2119 
2120 	if (zfs_keep_log_spacemaps_at_export)
2121 		return (B_FALSE);
2122 
2123 	return (B_TRUE);
2124 }
2125 
2126 /*
2127  * Opens a transaction that will set the flag that will instruct
2128  * spa_sync to attempt to flush all the metaslabs for that txg.
2129  */
static void
spa_unload_log_sm_flush_all(spa_t *spa)
{
	/*
	 * Create a tx against the MOS directory purely to learn which txg
	 * it will be assigned to; that txg becomes the flush-all trigger.
	 */
	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND));

	/* Only one flush-all request may be outstanding per pool. */
	ASSERT0(spa->spa_log_flushall_txg);
	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);

	/* Wait until spa_sync() has processed the trigger txg. */
	dmu_tx_commit(tx);
	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
}
2142 
2143 static void
spa_unload_log_sm_metadata(spa_t * spa)2144 spa_unload_log_sm_metadata(spa_t *spa)
2145 {
2146 	void *cookie = NULL;
2147 	spa_log_sm_t *sls;
2148 	log_summary_entry_t *e;
2149 
2150 	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
2151 	    &cookie)) != NULL) {
2152 		VERIFY0(sls->sls_mscount);
2153 		kmem_free(sls, sizeof (spa_log_sm_t));
2154 	}
2155 
2156 	while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
2157 		VERIFY0(e->lse_mscount);
2158 		kmem_free(e, sizeof (log_summary_entry_t));
2159 	}
2160 
2161 	spa->spa_unflushed_stats.sus_nblocks = 0;
2162 	spa->spa_unflushed_stats.sus_memused = 0;
2163 	spa->spa_unflushed_stats.sus_blocklimit = 0;
2164 }
2165 
2166 static void
spa_destroy_aux_threads(spa_t * spa)2167 spa_destroy_aux_threads(spa_t *spa)
2168 {
2169 	if (spa->spa_condense_zthr != NULL) {
2170 		zthr_destroy(spa->spa_condense_zthr);
2171 		spa->spa_condense_zthr = NULL;
2172 	}
2173 	if (spa->spa_checkpoint_discard_zthr != NULL) {
2174 		zthr_destroy(spa->spa_checkpoint_discard_zthr);
2175 		spa->spa_checkpoint_discard_zthr = NULL;
2176 	}
2177 	if (spa->spa_livelist_delete_zthr != NULL) {
2178 		zthr_destroy(spa->spa_livelist_delete_zthr);
2179 		spa->spa_livelist_delete_zthr = NULL;
2180 	}
2181 	if (spa->spa_livelist_condense_zthr != NULL) {
2182 		zthr_destroy(spa->spa_livelist_condense_zthr);
2183 		spa->spa_livelist_condense_zthr = NULL;
2184 	}
2185 	if (spa->spa_raidz_expand_zthr != NULL) {
2186 		zthr_destroy(spa->spa_raidz_expand_zthr);
2187 		spa->spa_raidz_expand_zthr = NULL;
2188 	}
2189 }
2190 
static void
spa_sync_time_logger(spa_t *spa, uint64_t txg, boolean_t force)
{
	uint64_t curtime, dirty;
	dmu_tx_t *tx;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	uint64_t idx = txg & TXG_MASK;

	/* Read-only pools never record txg times. */
	if (!spa_writeable(spa)) {
		return;
	}

	/*
	 * Note the (wall-clock, txg) pair in the in-core round-robin
	 * database, throttled to once per spa_note_txg_time seconds
	 * unless forced, and only for txgs newer than the last one noted.
	 */
	curtime = gethrestime_sec();
	if (txg > spa->spa_last_noted_txg &&
	    (force ||
	    curtime >= spa->spa_last_noted_txg_time + spa_note_txg_time)) {
		spa->spa_last_noted_txg_time = curtime;
		spa->spa_last_noted_txg = txg;

		mutex_enter(&spa->spa_txg_log_time_lock);
		dbrrd_add(&spa->spa_txg_log_time, curtime, txg);
		mutex_exit(&spa->spa_txg_log_time_lock);
	}

	/*
	 * Throttle on-disk flushes of the database to once per
	 * spa_flush_txg_time seconds unless forced, and never dirty a
	 * txg past the pool's final dirty txg.
	 */
	if (!force &&
	    curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) {
		return;
	}
	if (txg > spa_final_dirty_txg(spa)) {
		return;
	}
	/*
	 * NOTE(review): spa_last_flush_txg_time is stored here and again
	 * after the dirty check below, so it advances even when the
	 * "nothing dirty" early return skips the actual flush — confirm
	 * the double update is intentional.
	 */
	spa->spa_last_flush_txg_time = curtime;

	/* Skip the on-disk update if the txg carries no dirty data. */
	mutex_enter(&dp->dp_lock);
	dirty = dp->dp_dirty_pertxg[idx];
	mutex_exit(&dp->dp_lock);
	if (!force && dirty == 0) {
		return;
	}

	spa->spa_last_flush_txg_time = curtime;
	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * Persist all three RRD resolutions (minutes/days/months) into
	 * the MOS pool directory in this txg.
	 */
	VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
	    &spa->spa_txg_log_time.dbr_minutes, tx));
	VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
	    &spa->spa_txg_log_time.dbr_days, tx));
	VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
	    &spa->spa_txg_log_time.dbr_months, tx));
	dmu_tx_commit(tx);
}
2245 
2246 static void
spa_unload_sync_time_logger(spa_t * spa)2247 spa_unload_sync_time_logger(spa_t *spa)
2248 {
2249 	uint64_t txg;
2250 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
2251 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
2252 
2253 	txg = dmu_tx_get_txg(tx);
2254 	spa_sync_time_logger(spa, txg, B_TRUE);
2255 
2256 	dmu_tx_commit(tx);
2257 }
2258 
static void
spa_load_txg_log_time(spa_t *spa)
{
	int error;

	/*
	 * Load the three persisted txg-time round-robin databases
	 * (minute/day/month resolution) from the MOS pool directory.
	 * ENOENT simply means a pool that has never stored them (e.g.
	 * created before the feature); any other error is logged but
	 * not fatal — the in-core database starts empty.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
	    &spa->spa_txg_log_time.dbr_minutes);
	if (error != 0 && error != ENOENT) {
		spa_load_note(spa, "unable to load a txg time database with "
		    "minute resolution [error=%d]", error);
	}
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
	    &spa->spa_txg_log_time.dbr_days);
	if (error != 0 && error != ENOENT) {
		spa_load_note(spa, "unable to load a txg time database with "
		    "day resolution [error=%d]", error);
	}
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
	    &spa->spa_txg_log_time.dbr_months);
	if (error != 0 && error != ENOENT) {
		spa_load_note(spa, "unable to load a txg time database with "
		    "month resolution [error=%d]", error);
	}
}
2286 
2287 static boolean_t
spa_should_sync_time_logger_on_unload(spa_t * spa)2288 spa_should_sync_time_logger_on_unload(spa_t *spa)
2289 {
2290 
2291 	if (!spa_writeable(spa))
2292 		return (B_FALSE);
2293 
2294 	if (!spa->spa_sync_on)
2295 		return (B_FALSE);
2296 
2297 	if (spa_state(spa) != POOL_STATE_EXPORTED)
2298 		return (B_FALSE);
2299 
2300 	if (spa->spa_last_noted_txg == 0)
2301 		return (B_FALSE);
2302 
2303 	return (B_TRUE);
2304 }
2305 
2306 
2307 /*
2308  * Opposite of spa_load().
2309  */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Only the namespace-lock holder or the thread exporting this
	 * pool may unload it.
	 */
	ASSERT(spa_namespace_held() ||
	    spa->spa_export_thread == curthread);
	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);

	spa_import_progress_remove(spa_guid(spa));
	spa_load_note(spa, "UNLOADING");

	/* Unblock any threads waiting on pool activities. */
	spa_wake_waiters(spa);

	/*
	 * If we have set the spa_final_txg, we have already performed the
	 * tasks below in spa_export_common(). We should not redo it here since
	 * we delay the final TXGs beyond what spa_final_txg is set at.
	 */
	if (spa->spa_final_txg == UINT64_MAX) {
		if (spa_should_sync_time_logger_on_unload(spa))
			spa_unload_sync_time_logger(spa);

		/*
		 * If the log space map feature is enabled and the pool is
		 * getting exported (but not destroyed), we want to spend some
		 * time flushing as many metaslabs as we can in an attempt to
		 * destroy log space maps and save import time.
		 */
		if (spa_should_flush_logs_on_unload(spa))
			spa_unload_log_sm_flush_all(spa);

		/*
		 * Stop async tasks.
		 */
		spa_async_suspend(spa);

		/* Halt initialize/trim/rebuild activity across the tree. */
		if (spa->spa_root_vdev) {
			vdev_t *root_vdev = spa->spa_root_vdev;
			vdev_initialize_stop_all(root_vdev,
			    VDEV_INITIALIZE_ACTIVE);
			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
			vdev_autotrim_stop_all(spa);
			vdev_rebuild_stop_all(spa);
			l2arc_spa_rebuild_stop(spa);
		}

		/* Fix the final txg so no new dirty txgs are opened. */
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa->spa_final_txg = spa_last_synced_txg(spa) +
		    TXG_DEFER_SIZE + 1;
		spa_config_exit(spa, SCL_ALL, FTAG);
	}

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * This ensures that there is no async metaslab prefetching
	 * while we attempt to unload the spa.
	 */
	taskq_wait(spa->spa_metaslab_taskq);

	if (spa->spa_mmp.mmp_thread)
		mmp_thread_stop(spa);

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	/* Tear down any in-progress device removal state. */
	if (spa->spa_vdev_removal != NULL) {
		spa_vdev_removal_destroy(spa->spa_vdev_removal);
		spa->spa_vdev_removal = NULL;
	}

	spa_destroy_aux_threads(spa);

	spa_condense_fini(spa);

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT0P(spa->spa_root_vdev);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	/* Release in-core DDT, BRT, and log spacemap metadata. */
	ddt_unload(spa);
	brt_unload(spa);
	spa_unload_log_sm_metadata(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/* Free the spare vdev list and its stashed config. */
	if (spa->spa_spares.sav_vdevs) {
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			vdev_free(spa->spa_spares.sav_vdevs[i]);
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* Likewise for the l2cache vdev list and config. */
	if (spa->spa_l2cache.sav_vdevs) {
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
		}
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	spa->spa_indirect_vdevs_loaded = B_FALSE;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}
	if (spa->spa_compatibility != NULL) {
		spa_strfree(spa->spa_compatibility);
		spa->spa_compatibility = NULL;
	}

	spa->spa_raidz_expand = NULL;
	spa->spa_checkpoint_txg = 0;

	spa_config_exit(spa, SCL_ALL, spa);
}
2472 
2473 /*
2474  * Load (or re-load) the current list of vdevs describing the active spares for
2475  * this pool.  When this is called, we have some form of basic information in
2476  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
2477  * then re-generate a more complete list including status information.
2478  */
void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

#ifndef _KERNEL
	/*
	 * zdb opens both the current state of the pool and the
	 * checkpointed state (if present), with a different spa_t.
	 *
	 * As spare vdevs are shared among open pools, we skip loading
	 * them when we load the checkpointed state of the pool.
	 */
	if (!spa_writeable(spa))
		return;
#endif

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	if (spa->spa_spares.sav_vdevs) {
		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];

			/* Undo the call to spa_activate() below */
			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
			    B_FALSE)) != NULL && tvd->vdev_isspare)
				spa_spare_remove(tvd);
			vdev_close(vd);
			vdev_free(vd);
		}

		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
	}

	/* Pull the spare list out of the stashed config, if any. */
	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares));

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.   For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY0(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE));
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		/* A spare that fails to open stays on the list unvalidated. */
		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
	    spa->spa_spares.sav_count);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
2600 
2601 /*
2602  * Load (or re-load) the current list of vdevs describing the active l2cache for
2603  * this pool.  When this is called, we have some form of basic information in
2604  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
2605  * then re-generate a more complete list including status information.
2606  * Devices which are already active have their details maintained, and are
2607  * not re-opened.
2608  */
void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache = NULL;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

#ifndef _KERNEL
	/*
	 * zdb opens both the current state of the pool and the
	 * checkpointed state (if present), with a different spa_t.
	 *
	 * As L2 caches are part of the ARC which is shared among open
	 * pools, we skip loading them when we load the checkpointed
	 * state of the pool.
	 */
	if (!spa_writeable(spa))
		return;
#endif

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* Detach the current list; surviving vdevs are re-adopted below. */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	if (sav->sav_config == NULL) {
		nl2cache = 0;
		newvdevs = NULL;
		goto out;
	}

	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY0(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE));
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);

			/*
			 * Upon cache device addition to a pool or pool
			 * creation with a cache device or if the header
			 * of the device is invalid we issue an async
			 * TRIM command for the whole device which will
			 * execute if l2arc_trim_ahead > 0.
			 */
			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
		}
	}

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);

	if (sav->sav_count > 0)
		l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
		    KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    (const nvlist_t * const *)l2cache, sav->sav_count);

out:
	/*
	 * Purge vdevs that were dropped
	 */
	if (oldvdevs) {
		for (i = 0; i < oldnvdevs; i++) {
			uint64_t pool;

			vd = oldvdevs[i];
			if (vd != NULL) {
				ASSERT(vd->vdev_isl2cache);

				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
				    pool != 0ULL && l2arc_vdev_present(vd))
					l2arc_remove_vdev(vd);
				vdev_clear_stats(vd);
				vdev_free(vd);
			}
		}

		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
	}

	/* Free the generated config nvlists (sav_count is 0 on goto out). */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
2753 
2754 static int
load_nvlist(spa_t * spa,uint64_t obj,nvlist_t ** value)2755 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
2756 {
2757 	dmu_buf_t *db;
2758 	char *packed = NULL;
2759 	size_t nvsize = 0;
2760 	int error;
2761 	*value = NULL;
2762 
2763 	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
2764 	if (error)
2765 		return (error);
2766 
2767 	nvsize = *(uint64_t *)db->db_data;
2768 	dmu_buf_rele(db, FTAG);
2769 
2770 	packed = vmem_alloc(nvsize, KM_SLEEP);
2771 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
2772 	    DMU_READ_PREFETCH);
2773 	if (error == 0)
2774 		error = nvlist_unpack(packed, nvsize, value, 0);
2775 	vmem_free(packed, nvsize);
2776 
2777 	return (error);
2778 }
2779 
2780 /*
2781  * Concrete top-level vdevs that are not missing and are not logs. At every
2782  * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
2783  */
2784 static uint64_t
spa_healthy_core_tvds(spa_t * spa)2785 spa_healthy_core_tvds(spa_t *spa)
2786 {
2787 	vdev_t *rvd = spa->spa_root_vdev;
2788 	uint64_t tvds = 0;
2789 
2790 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
2791 		vdev_t *vd = rvd->vdev_child[i];
2792 		if (vd->vdev_islog)
2793 			continue;
2794 		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
2795 			tvds++;
2796 	}
2797 
2798 	return (tvds);
2799 }
2800 
2801 /*
2802  * Checks to see if the given vdev could not be opened, in which case we post a
2803  * sysevent to notify the autoreplace code that the device has been removed.
2804  */
2805 static void
spa_check_removed(vdev_t * vd)2806 spa_check_removed(vdev_t *vd)
2807 {
2808 	for (uint64_t c = 0; c < vd->vdev_children; c++)
2809 		spa_check_removed(vd->vdev_child[c]);
2810 
2811 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
2812 	    vdev_is_concrete(vd)) {
2813 		zfs_post_autoreplace(vd->vdev_spa, vd);
2814 		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
2815 	}
2816 }
2817 
2818 static int
spa_check_for_missing_logs(spa_t * spa)2819 spa_check_for_missing_logs(spa_t *spa)
2820 {
2821 	vdev_t *rvd = spa->spa_root_vdev;
2822 
2823 	/*
2824 	 * If we're doing a normal import, then build up any additional
2825 	 * diagnostic information about missing log devices.
2826 	 * We'll pass this up to the user for further processing.
2827 	 */
2828 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
2829 		nvlist_t **child, *nv;
2830 		uint64_t idx = 0;
2831 
2832 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
2833 		    KM_SLEEP);
2834 		nv = fnvlist_alloc();
2835 
2836 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2837 			vdev_t *tvd = rvd->vdev_child[c];
2838 
2839 			/*
2840 			 * We consider a device as missing only if it failed
2841 			 * to open (i.e. offline or faulted is not considered
2842 			 * as missing).
2843 			 */
2844 			if (tvd->vdev_islog &&
2845 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
2846 				child[idx++] = vdev_config_generate(spa, tvd,
2847 				    B_FALSE, VDEV_CONFIG_MISSING);
2848 			}
2849 		}
2850 
2851 		if (idx > 0) {
2852 			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2853 			    (const nvlist_t * const *)child, idx);
2854 			fnvlist_add_nvlist(spa->spa_load_info,
2855 			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
2856 
2857 			for (uint64_t i = 0; i < idx; i++)
2858 				nvlist_free(child[i]);
2859 		}
2860 		nvlist_free(nv);
2861 		kmem_free(child, rvd->vdev_children * sizeof (char **));
2862 
2863 		if (idx > 0) {
2864 			spa_load_failed(spa, "some log devices are missing");
2865 			vdev_dbgmsg_print_tree(rvd, 2);
2866 			return (SET_ERROR(ENXIO));
2867 		}
2868 	} else {
2869 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2870 			vdev_t *tvd = rvd->vdev_child[c];
2871 
2872 			if (tvd->vdev_islog &&
2873 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
2874 				spa_set_log_state(spa, SPA_LOG_CLEAR);
2875 				spa_load_note(spa, "some log devices are "
2876 				    "missing, ZIL is dropped.");
2877 				vdev_dbgmsg_print_tree(rvd, 2);
2878 				break;
2879 			}
2880 		}
2881 	}
2882 
2883 	return (0);
2884 }
2885 
2886 /*
2887  * Check for missing log devices
2888  */
2889 static boolean_t
spa_check_logs(spa_t * spa)2890 spa_check_logs(spa_t *spa)
2891 {
2892 	boolean_t rv = B_FALSE;
2893 	dsl_pool_t *dp = spa_get_dsl(spa);
2894 
2895 	switch (spa->spa_log_state) {
2896 	default:
2897 		break;
2898 	case SPA_LOG_MISSING:
2899 		/* need to recheck in case slog has been restored */
2900 	case SPA_LOG_UNKNOWN:
2901 		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2902 		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
2903 		if (rv)
2904 			spa_set_log_state(spa, SPA_LOG_MISSING);
2905 		break;
2906 	}
2907 	return (rv);
2908 }
2909 
2910 /*
2911  * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
2912  */
2913 static boolean_t
spa_passivate_log(spa_t * spa)2914 spa_passivate_log(spa_t *spa)
2915 {
2916 	vdev_t *rvd = spa->spa_root_vdev;
2917 	boolean_t slog_found = B_FALSE;
2918 
2919 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2920 
2921 	for (int c = 0; c < rvd->vdev_children; c++) {
2922 		vdev_t *tvd = rvd->vdev_child[c];
2923 
2924 		if (tvd->vdev_islog) {
2925 			ASSERT0P(tvd->vdev_log_mg);
2926 			metaslab_group_passivate(tvd->vdev_mg);
2927 			slog_found = B_TRUE;
2928 		}
2929 	}
2930 
2931 	return (slog_found);
2932 }
2933 
2934 /*
2935  * Activate any log vdevs (note, does not apply to embedded log metaslabs).
2936  */
2937 static void
spa_activate_log(spa_t * spa)2938 spa_activate_log(spa_t *spa)
2939 {
2940 	vdev_t *rvd = spa->spa_root_vdev;
2941 
2942 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2943 
2944 	for (int c = 0; c < rvd->vdev_children; c++) {
2945 		vdev_t *tvd = rvd->vdev_child[c];
2946 
2947 		if (tvd->vdev_islog) {
2948 			ASSERT0P(tvd->vdev_log_mg);
2949 			metaslab_group_activate(tvd->vdev_mg);
2950 		}
2951 	}
2952 }
2953 
2954 int
spa_reset_logs(spa_t * spa)2955 spa_reset_logs(spa_t *spa)
2956 {
2957 	int error;
2958 
2959 	error = dmu_objset_find(spa_name(spa), zil_reset,
2960 	    NULL, DS_FIND_CHILDREN);
2961 	if (error == 0) {
2962 		/*
2963 		 * We successfully offlined the log device, sync out the
2964 		 * current txg so that the "stubby" block can be removed
2965 		 * by zil_sync().
2966 		 */
2967 		txg_wait_synced(spa->spa_dsl_pool, 0);
2968 	}
2969 	return (error);
2970 }
2971 
2972 static void
spa_aux_check_removed(spa_aux_vdev_t * sav)2973 spa_aux_check_removed(spa_aux_vdev_t *sav)
2974 {
2975 	for (int i = 0; i < sav->sav_count; i++)
2976 		spa_check_removed(sav->sav_vdevs[i]);
2977 }
2978 
2979 void
spa_claim_notify(zio_t * zio)2980 spa_claim_notify(zio_t *zio)
2981 {
2982 	spa_t *spa = zio->io_spa;
2983 
2984 	if (zio->io_error)
2985 		return;
2986 
2987 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
2988 	if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp))
2989 		spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp);
2990 	mutex_exit(&spa->spa_props_lock);
2991 }
2992 
/* Accumulated results of a spa_load_verify() pool traversal. */
typedef struct spa_load_error {
	boolean_t	sle_verify_data;	/* also read user-data blocks */
	uint64_t	sle_meta_count;		/* metadata read errors seen */
	uint64_t	sle_data_count;		/* data read errors seen */
} spa_load_error_t;
2998 
2999 static void
spa_load_verify_done(zio_t * zio)3000 spa_load_verify_done(zio_t *zio)
3001 {
3002 	blkptr_t *bp = zio->io_bp;
3003 	spa_load_error_t *sle = zio->io_private;
3004 	dmu_object_type_t type = BP_GET_TYPE(bp);
3005 	int error = zio->io_error;
3006 	spa_t *spa = zio->io_spa;
3007 
3008 	abd_free(zio->io_abd);
3009 	if (error) {
3010 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
3011 		    type != DMU_OT_INTENT_LOG)
3012 			atomic_inc_64(&sle->sle_meta_count);
3013 		else
3014 			atomic_inc_64(&sle->sle_data_count);
3015 	}
3016 
3017 	mutex_enter(&spa->spa_scrub_lock);
3018 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
3019 	cv_broadcast(&spa->spa_scrub_io_cv);
3020 	mutex_exit(&spa->spa_scrub_lock);
3021 }
3022 
/*
 * Maximum number of inflight bytes is the log2 fraction of the arc size.
 * By default, we set it to 1/16th of the arc.
 */
static uint_t spa_load_verify_shift = 4;	/* arc >> shift = byte budget */
/* Issue verification reads for metadata blocks during import. */
static int spa_load_verify_metadata = B_TRUE;
/* Issue verification reads for user-data blocks during import. */
static int spa_load_verify_data = B_TRUE;
3030 
/*
 * Pool-traversal callback for spa_load_verify(): issue an asynchronous,
 * throttled read of each eligible block so that read failures are tallied
 * by spa_load_verify_done().  Always returns 0 so the traversal continues;
 * errors are counted, not propagated.
 */
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	zio_t *rio = arg;
	spa_load_error_t *sle = rio->io_private;

	(void) zilog, (void) dnp;

	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set.  However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);

	/*
	 * Sanity check the block pointer in order to detect obvious damage
	 * before using the contents in subsequent checks or in zio_read().
	 * When damaged consider it to be a metadata error since we cannot
	 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
	 */
	if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
		atomic_inc_64(&sle->sle_meta_count);
		return (0);
	}

	/* Nothing to read for dnodes, holes, embedded or redacted blocks. */
	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
	    BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
		return (0);

	/* Skip user data unless data verification was requested. */
	if (!BP_IS_METADATA(bp) &&
	    (!spa_load_verify_data || !sle->sle_verify_data))
		return (0);

	uint64_t maxinflight_bytes =
	    arc_target_bytes() >> spa_load_verify_shift;
	size_t size = BP_GET_PSIZE(bp);

	/* Throttle: wait until the inflight byte count drops below budget. */
	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_load_verify_bytes >= maxinflight_bytes)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_load_verify_bytes += size;
	mutex_exit(&spa->spa_scrub_lock);

	/* Completion (spa_load_verify_done) frees the abd and counts errors. */
	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}
3083 
3084 static int
verify_dataset_name_len(dsl_pool_t * dp,dsl_dataset_t * ds,void * arg)3085 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
3086 {
3087 	(void) dp, (void) arg;
3088 
3089 	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
3090 		return (SET_ERROR(ENAMETOOLONG));
3091 
3092 	return (0);
3093 }
3094 
3095 static int
spa_load_verify(spa_t * spa)3096 spa_load_verify(spa_t *spa)
3097 {
3098 	zio_t *rio;
3099 	spa_load_error_t sle = { 0 };
3100 	zpool_load_policy_t policy;
3101 	boolean_t verify_ok = B_FALSE;
3102 	int error = 0;
3103 
3104 	zpool_get_load_policy(spa->spa_config, &policy);
3105 
3106 	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
3107 	    policy.zlp_maxmeta == UINT64_MAX)
3108 		return (0);
3109 
3110 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
3111 	error = dmu_objset_find_dp(spa->spa_dsl_pool,
3112 	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
3113 	    DS_FIND_CHILDREN);
3114 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
3115 	if (error != 0)
3116 		return (error);
3117 
3118 	/*
3119 	 * Verify data only if we are rewinding or error limit was set.
3120 	 * Otherwise nothing except dbgmsg care about it to waste time.
3121 	 */
3122 	sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
3123 	    (policy.zlp_maxdata < UINT64_MAX);
3124 
3125 	rio = zio_root(spa, NULL, &sle,
3126 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
3127 
3128 	if (spa_load_verify_metadata) {
3129 		if (spa->spa_extreme_rewind) {
3130 			spa_load_note(spa, "performing a complete scan of the "
3131 			    "pool since extreme rewind is on. This may take "
3132 			    "a very long time.\n  (spa_load_verify_data=%u, "
3133 			    "spa_load_verify_metadata=%u)",
3134 			    spa_load_verify_data, spa_load_verify_metadata);
3135 		}
3136 
3137 		error = traverse_pool(spa, spa->spa_verify_min_txg,
3138 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
3139 		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
3140 	}
3141 
3142 	(void) zio_wait(rio);
3143 	ASSERT0(spa->spa_load_verify_bytes);
3144 
3145 	spa->spa_load_meta_errors = sle.sle_meta_count;
3146 	spa->spa_load_data_errors = sle.sle_data_count;
3147 
3148 	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
3149 		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
3150 		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
3151 		    (u_longlong_t)sle.sle_data_count);
3152 	}
3153 
3154 	if (spa_load_verify_dryrun ||
3155 	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
3156 	    sle.sle_data_count <= policy.zlp_maxdata)) {
3157 		int64_t loss = 0;
3158 
3159 		verify_ok = B_TRUE;
3160 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
3161 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
3162 
3163 		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
3164 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
3165 		    spa->spa_load_txg_ts);
3166 		fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
3167 		    loss);
3168 		fnvlist_add_uint64(spa->spa_load_info,
3169 		    ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
3170 		fnvlist_add_uint64(spa->spa_load_info,
3171 		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
3172 	} else {
3173 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
3174 	}
3175 
3176 	if (spa_load_verify_dryrun)
3177 		return (0);
3178 
3179 	if (error) {
3180 		if (error != ENXIO && error != EIO)
3181 			error = SET_ERROR(EIO);
3182 		return (error);
3183 	}
3184 
3185 	return (verify_ok ? 0 : EIO);
3186 }
3187 
/*
 * Find a value in the pool props object.
 *
 * Looks up the named property in the MOS pool-props ZAP and stores the
 * result in *val.  The lookup error is deliberately ignored; on failure
 * *val is presumably left at its caller-supplied default.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}
3197 
3198 /*
3199  * Find a value in the pool directory object.
3200  */
3201 static int
spa_dir_prop(spa_t * spa,const char * name,uint64_t * val,boolean_t log_enoent)3202 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
3203 {
3204 	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3205 	    name, sizeof (uint64_t), 1, val);
3206 
3207 	if (error != 0 && (error != ENOENT || log_enoent)) {
3208 		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
3209 		    "[error=%d]", name, error);
3210 	}
3211 
3212 	return (error);
3213 }
3214 
/*
 * Mark the vdev unopenable with the given aux reason and hand the caller
 * back its errno wrapped in SET_ERROR() for error-location tracking.
 */
static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (SET_ERROR(err));
}
3221 
/*
 * There is livelist deletion work pending whenever the deleted-clones
 * ZAP object (spa_livelists_to_delete) is set.
 */
boolean_t
spa_livelist_delete_check(spa_t *spa)
{
	return (spa->spa_livelists_to_delete != 0);
}
3227 
3228 static boolean_t
spa_livelist_delete_cb_check(void * arg,zthr_t * z)3229 spa_livelist_delete_cb_check(void *arg, zthr_t *z)
3230 {
3231 	(void) z;
3232 	spa_t *spa = arg;
3233 	return (spa_livelist_delete_check(spa));
3234 }
3235 
/*
 * bplist_iterate() callback run in syncing context: free the block and
 * charge the reclaimed space against the pool's free dir (dp_free_dir).
 */
static int
delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	zio_free(spa, tx->tx_txg, bp);
	/* Negative deltas: space is being released, not consumed. */
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	    -bp_get_dsize_sync(spa, bp),
	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	return (0);
}
3246 
3247 static int
dsl_get_next_livelist_obj(objset_t * os,uint64_t zap_obj,uint64_t * llp)3248 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
3249 {
3250 	int err;
3251 	zap_cursor_t zc;
3252 	zap_attribute_t *za = zap_attribute_alloc();
3253 	zap_cursor_init(&zc, os, zap_obj);
3254 	err = zap_cursor_retrieve(&zc, za);
3255 	zap_cursor_fini(&zc);
3256 	if (err == 0)
3257 		*llp = za->za_first_integer;
3258 	zap_attribute_free(za);
3259 	return (err);
3260 }
3261 
/*
 * Components of livelist deletion that must be performed in syncing
 * context: freeing block pointers and updating the pool-wide data
 * structures to indicate how much work is left to do
 */
typedef struct sublist_delete_arg {
	spa_t *spa;		/* pool being modified */
	dsl_deadlist_t *ll;	/* livelist the sublist belongs to */
	uint64_t key;		/* mintxg key of the entry to remove */
	bplist_t *to_free;	/* blkptrs to free in this pass */
} sublist_delete_arg_t;
3273 
3274 static void
sublist_delete_sync(void * arg,dmu_tx_t * tx)3275 sublist_delete_sync(void *arg, dmu_tx_t *tx)
3276 {
3277 	sublist_delete_arg_t *sda = arg;
3278 	spa_t *spa = sda->spa;
3279 	dsl_deadlist_t *ll = sda->ll;
3280 	uint64_t key = sda->key;
3281 	bplist_t *to_free = sda->to_free;
3282 
3283 	bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
3284 	dsl_deadlist_remove_entry(ll, key, tx);
3285 }
3286 
/* Arguments for livelist_delete_sync(), run once a livelist is drained. */
typedef struct livelist_delete_arg {
	spa_t *spa;		/* pool being modified */
	uint64_t ll_obj;	/* livelist object to destroy */
	uint64_t zap_obj;	/* deleted-clones ZAP that references ll_obj */
} livelist_delete_arg_t;
3292 
/*
 * Syncing-context task that destroys a fully-processed livelist: remove
 * it from the deleted-clones ZAP, free it, and decrement the LIVELIST
 * feature count.  When the ZAP becomes empty, the ZAP itself and its
 * pool-directory entry are destroyed and waiters are notified.
 */
static void
livelist_delete_sync(void *arg, dmu_tx_t *tx)
{
	livelist_delete_arg_t *lda = arg;
	spa_t *spa = lda->spa;
	uint64_t ll_obj = lda->ll_obj;
	uint64_t zap_obj = lda->zap_obj;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t count;

	/* free the livelist and decrement the feature count */
	VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
	dsl_deadlist_free(mos, ll_obj, tx);
	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
	VERIFY0(zap_count(mos, zap_obj, &count));
	if (count == 0) {
		/* no more livelists to delete */
		VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DELETED_CLONES, tx));
		VERIFY0(zap_destroy(mos, zap_obj, tx));
		spa->spa_livelists_to_delete = 0;
		spa_notify_waiters(spa);
	}
}
3317 
/*
 * Load in the value for the livelist to be removed and open it. Then,
 * load its first sublist and determine which block pointers should actually
 * be freed. Then, call a synctask which performs the actual frees and updates
 * the pool-wide livelist data.
 */
static void
spa_livelist_delete_cb(void *arg, zthr_t *z)
{
	spa_t *spa = arg;
	uint64_t ll_obj = 0, count;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zap_obj = spa->spa_livelists_to_delete;
	/*
	 * Determine the next livelist to delete. This function should only
	 * be called if there is at least one deleted clone.
	 */
	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
	VERIFY0(zap_count(mos, ll_obj, &count));
	if (count > 0) {
		/* The livelist still has sublists: process its first one. */
		dsl_deadlist_t *ll;
		dsl_deadlist_entry_t *dle;
		bplist_t to_free;
		ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
		VERIFY0(dsl_deadlist_open(ll, mos, ll_obj));
		dle = dsl_deadlist_first(ll);
		ASSERT3P(dle, !=, NULL);
		bplist_create(&to_free);
		/*
		 * Gather the blkptrs to free from the first sublist; this
		 * can be interrupted by zthr cancellation, in which case
		 * EINTR is the only error expected below.
		 */
		int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
		    z, NULL);
		if (err == 0) {
			sublist_delete_arg_t sync_arg = {
			    .spa = spa,
			    .ll = ll,
			    .key = dle->dle_mintxg,
			    .to_free = &to_free
			};
			zfs_dbgmsg("deleting sublist (id %llu) from"
			    " livelist %llu, %lld remaining",
			    (u_longlong_t)dle->dle_bpobj.bpo_object,
			    (u_longlong_t)ll_obj, (longlong_t)count - 1);
			/* Frees and entry removal happen in syncing context. */
			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
			    sublist_delete_sync, &sync_arg, 0,
			    ZFS_SPACE_CHECK_DESTROY));
		} else {
			VERIFY3U(err, ==, EINTR);
		}
		bplist_clear(&to_free);
		bplist_destroy(&to_free);
		dsl_deadlist_close(ll);
		kmem_free(ll, sizeof (dsl_deadlist_t));
	} else {
		/* Livelist fully drained: destroy it in syncing context. */
		livelist_delete_arg_t sync_arg = {
		    .spa = spa,
		    .ll_obj = ll_obj,
		    .zap_obj = zap_obj
		};
		zfs_dbgmsg("deletion of livelist %llu completed",
		    (u_longlong_t)ll_obj);
		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
	}
}
3381 
/* Spawn the zthr that incrementally deletes livelists of deleted clones. */
static void
spa_start_livelist_destroy_thread(spa_t *spa)
{
	ASSERT0P(spa->spa_livelist_delete_zthr);
	spa->spa_livelist_delete_zthr =
	    zthr_create("z_livelist_destroy",
	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
	    minclsyspri);
}
3391 
/* Destination bplists for blkptrs discovered by livelist_track_new_cb(). */
typedef struct livelist_new_arg {
	bplist_t *allocs;	/* newly appended ALLOC entries */
	bplist_t *frees;	/* newly appended FREE entries */
} livelist_new_arg_t;
3396 
3397 static int
livelist_track_new_cb(void * arg,const blkptr_t * bp,boolean_t bp_freed,dmu_tx_t * tx)3398 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
3399     dmu_tx_t *tx)
3400 {
3401 	ASSERT0P(tx);
3402 	livelist_new_arg_t *lna = arg;
3403 	if (bp_freed) {
3404 		bplist_append(lna->frees, bp);
3405 	} else {
3406 		bplist_append(lna->allocs, bp);
3407 		zfs_livelist_condense_new_alloc++;
3408 	}
3409 	return (0);
3410 }
3411 
/* State handed from spa_livelist_condense_cb() to its synctask. */
typedef struct livelist_condense_arg {
	spa_t *spa;		/* pool being condensed */
	bplist_t to_keep;	/* surviving ALLOC blkptrs */
	uint64_t first_size;	/* blkptrs of 'first' already processed */
	uint64_t next_size;	/* blkptrs of 'next' already processed */
} livelist_condense_arg_t;
3418 
/*
 * Syncing-context half of livelist condensing: replace the two entries
 * chosen in open context with one condensed entry, re-inserting any
 * blkptrs appended to either sublist after the zthr captured their
 * sizes.  Consumes 'lca' (freeing it) and releases the dataset hold in
 * all paths, including cancellation.
 */
static void
spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
{
	livelist_condense_arg_t *lca = arg;
	spa_t *spa = lca->spa;
	bplist_t new_frees;
	dsl_dataset_t *ds = spa->spa_to_condense.ds;

	/* Have we been cancelled? */
	if (spa->spa_to_condense.cancelled) {
		zfs_livelist_condense_sync_cancel++;
		goto out;
	}

	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;

	/*
	 * It's possible that the livelist was changed while the zthr was
	 * running. Therefore, we need to check for new blkptrs in the two
	 * entries being condensed and continue to track them in the livelist.
	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
	 * we need to sort them into two different bplists.
	 */
	uint64_t first_obj = first->dle_bpobj.bpo_object;
	uint64_t next_obj = next->dle_bpobj.bpo_object;
	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;

	bplist_create(&new_frees);
	livelist_new_arg_t new_bps = {
	    .allocs = &lca->to_keep,
	    .frees = &new_frees,
	};

	/* Pick up blkptrs appended since the open-context pass. */
	if (cur_first_size > lca->first_size) {
		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
		    livelist_track_new_cb, &new_bps, lca->first_size));
	}
	if (cur_next_size > lca->next_size) {
		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
		    livelist_track_new_cb, &new_bps, lca->next_size));
	}

	/* Empty 'first', drop 'next', then refill 'first' with survivors. */
	dsl_deadlist_clear_entry(first, ll, tx);
	ASSERT(bpobj_is_empty(&first->dle_bpobj));
	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);

	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
	bplist_destroy(&new_frees);

	char dsname[ZFS_MAX_DATASET_NAME_LEN];
	dsl_dataset_name(ds, dsname);
	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
	    "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
	    (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
	    (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
	    (u_longlong_t)cur_next_size,
	    (u_longlong_t)first->dle_bpobj.bpo_object,
	    (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
out:
	/* Common teardown: release dataset hold, free lca, allow restart. */
	dmu_buf_rele(ds->ds_dbuf, spa);
	spa->spa_to_condense.ds = NULL;
	bplist_clear(&lca->to_keep);
	bplist_destroy(&lca->to_keep);
	kmem_free(lca, sizeof (livelist_condense_arg_t));
	spa->spa_to_condense.syncing = B_FALSE;
}
3491 
/*
 * Open-context half of livelist condensing: pre-process the two sublists
 * selected in spa_to_condense (matching FREEs against ALLOCs) and then
 * dispatch spa_livelist_condense_sync() to commit the result.  On
 * failure (zthr cancellation, or inability to assign a tx) all temporary
 * state is torn down here; a tx-assignment failure simply means the
 * condense will be retried in a later txg.
 */
static void
spa_livelist_condense_cb(void *arg, zthr_t *t)
{
	/* Test-only pause point (zfs_livelist_condense_zthr_pause tunable). */
	while (zfs_livelist_condense_zthr_pause &&
	    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
		delay(1);

	spa_t *spa = arg;
	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
	uint64_t first_size, next_size;

	livelist_condense_arg_t *lca =
	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
	bplist_create(&lca->to_keep);

	/*
	 * Process the livelists (matching FREEs and ALLOCs) in open context
	 * so we have minimal work in syncing context to condense.
	 *
	 * We save bpobj sizes (first_size and next_size) to use later in
	 * syncing context to determine if entries were added to these sublists
	 * while in open context. This is possible because the clone is still
	 * active and open for normal writes and we want to make sure the new,
	 * unprocessed blockpointers are inserted into the livelist normally.
	 *
	 * Note that dsl_process_sub_livelist() both stores the size number of
	 * blockpointers and iterates over them while the bpobj's lock held, so
	 * the sizes returned to us are consistent which what was actually
	 * processed.
	 */
	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
	    &first_size);
	if (err == 0)
		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
		    t, &next_size);

	if (err == 0) {
		/* Second test-only pause point, before the synctask. */
		while (zfs_livelist_condense_sync_pause &&
		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
			delay(1);

		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
		dmu_tx_mark_netfree(tx);
		dmu_tx_hold_space(tx, 1);
		err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE);
		if (err == 0) {
			/*
			 * Prevent the condense zthr restarting before
			 * the synctask completes.
			 */
			spa->spa_to_condense.syncing = B_TRUE;
			lca->spa = spa;
			lca->first_size = first_size;
			lca->next_size = next_size;
			dsl_sync_task_nowait(spa_get_dsl(spa),
			    spa_livelist_condense_sync, lca, tx);
			dmu_tx_commit(tx);
			/* lca is now owned (and freed) by the synctask. */
			return;
		}
	}
	/*
	 * Condensing can not continue: either it was externally stopped or
	 * we were unable to assign to a tx because the pool has run out of
	 * space. In the second case, we'll just end up trying to condense
	 * again in a later txg.
	 */
	ASSERT(err != 0);
	bplist_clear(&lca->to_keep);
	bplist_destroy(&lca->to_keep);
	kmem_free(lca, sizeof (livelist_condense_arg_t));
	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
	spa->spa_to_condense.ds = NULL;
	if (err == EINTR)
		zfs_livelist_condense_zthr_cancel++;
}
3568 
3569 /*
3570  * Check that there is something to condense but that a condense is not
3571  * already in progress and that condensing has not been cancelled.
3572  */
3573 static boolean_t
spa_livelist_condense_cb_check(void * arg,zthr_t * z)3574 spa_livelist_condense_cb_check(void *arg, zthr_t *z)
3575 {
3576 	(void) z;
3577 	spa_t *spa = arg;
3578 	if ((spa->spa_to_condense.ds != NULL) &&
3579 	    (spa->spa_to_condense.syncing == B_FALSE) &&
3580 	    (spa->spa_to_condense.cancelled == B_FALSE)) {
3581 		return (B_TRUE);
3582 	}
3583 	return (B_FALSE);
3584 }
3585 
/* Reset the condense bookkeeping and spawn the livelist-condense zthr. */
static void
spa_start_livelist_condensing_thread(spa_t *spa)
{
	spa->spa_to_condense.ds = NULL;
	spa->spa_to_condense.first = NULL;
	spa->spa_to_condense.next = NULL;
	spa->spa_to_condense.syncing = B_FALSE;
	spa->spa_to_condense.cancelled = B_FALSE;

	ASSERT0P(spa->spa_livelist_condense_zthr);
	spa->spa_livelist_condense_zthr =
	    zthr_create("z_livelist_condense",
	    spa_livelist_condense_cb_check,
	    spa_livelist_condense_cb, spa, minclsyspri);
}
3601 
/*
 * Start the auxiliary zthrs a writeable pool relies on: raidz expansion,
 * indirect condensing, livelist destroy/condense, and checkpoint discard.
 */
static void
spa_spawn_aux_threads(spa_t *spa)
{
	ASSERT(spa_writeable(spa));

	spa_start_raidz_expansion_thread(spa);
	spa_start_indirect_condensing_thread(spa);
	spa_start_livelist_destroy_thread(spa);
	spa_start_livelist_condensing_thread(spa);

	ASSERT0P(spa->spa_checkpoint_discard_zthr);
	spa->spa_checkpoint_discard_zthr =
	    zthr_create("z_checkpoint_discard",
	    spa_checkpoint_discard_thread_check,
	    spa_checkpoint_discard_thread, spa, minclsyspri);
}
3618 
/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	/* No split in progress: nothing to repair. */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	/* Note: 'extracted' is only read when attempt_reopen set it above. */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}
3700 
/*
 * Wrapper around spa_load_impl() that maintains load-state and import
 * progress bookkeeping, waits out evicting objsets before capturing the
 * pool's minimum refcount, and posts an FMA ereport on failure (no
 * ereport is posted for EBADF).
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
{
	const char *ereport = FM_EREPORT_ZFS_POOL;
	int error;

	spa->spa_load_state = state;
	(void) spa_import_progress_set_state(spa_guid(spa),
	    spa_load_state(spa));
	spa_import_progress_set_notes(spa, "spa_load()");

	gethrestime(&spa->spa_loaded_ts);
	error = spa_load_impl(spa, type, &ereport);

	/*
	 * Don't count references from objsets that are already closed
	 * and are making their way through the eviction process.
	 */
	spa_evicting_os_wait(spa);
	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
	if (error) {
		/* The load timestamp is kept only for EEXIST. */
		if (error != EEXIST) {
			spa->spa_loaded_ts.tv_sec = 0;
			spa->spa_loaded_ts.tv_nsec = 0;
		}
		if (error != EBADF) {
			(void) zfs_ereport_post(ereport, spa,
			    NULL, NULL, NULL, 0);
		}
	}
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	(void) spa_import_progress_set_state(spa_guid(spa),
	    spa_load_state(spa));

	return (error);
}
3739 
3740 #ifdef ZFS_DEBUG
/*
 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
 * spa's per-vdev ZAP list.
 */
static uint64_t
vdev_count_verify_zaps(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t total = 0;

	/* The root ZAP only exists (and counts) under the AVZ_V2 feature. */
	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) &&
	    vd->vdev_root_zap != 0) {
		total++;
		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, vd->vdev_root_zap));
	}
	if (vd->vdev_top_zap != 0) {
		total++;
		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
	}
	if (vd->vdev_leaf_zap != 0) {
		total++;
		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
	}

	/* Recurse into the subtree and accumulate child counts. */
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		total += vdev_count_verify_zaps(vd->vdev_child[i]);
	}

	return (total);
}
3775 #else
3776 #define	vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
3777 #endif
3778 
/*
 * Check the results load_info results from previous tryimport.
 *
 * error results:
 *          0 - Pool remains in an idle state
 *  EREMOTEIO - Pool was known to be active on the other host
 *     ENOENT - The config does not contain complete tryimport info
 */
static int
spa_activity_verify_config(spa_t *spa, uberblock_t *ub)
{
	uint64_t tryconfig_mmp_state = MMP_STATE_ACTIVE;
	uint64_t tryconfig_txg = 0;
	uint64_t tryconfig_timestamp = 0;
	uint16_t tryconfig_mmp_seq = 0;
	nvlist_t *nvinfo, *config = spa->spa_config;
	int error;

	/* Simply a non-zero value to indicate the verify was done. */
	spa->spa_mmp.mmp_import_ns = 1000;

	error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo);
	if (error)
		return (SET_ERROR(ENOENT));

	/*
	 * If ZPOOL_CONFIG_MMP_STATE is present an activity check was performed
	 * during the earlier tryimport.  If the state recorded there isn't
	 * MMP_STATE_INACTIVE the pool is known to be active on another host.
	 */
	error = nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE,
	    &tryconfig_mmp_state);
	if (error)
		return (SET_ERROR(ENOENT));

	if (tryconfig_mmp_state != MMP_STATE_INACTIVE) {
		spa_load_failed(spa, "mmp: pool is active on remote host, "
		    "state=%llu", (u_longlong_t)tryconfig_mmp_state);
		return (SET_ERROR(EREMOTEIO));
	}

	/*
	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
	 * during the earlier tryimport.  If the txg recorded there is 0 then
	 * the pool is known to be active on another host.
	 */
	error = nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
	    &tryconfig_txg);
	if (error)
		return (SET_ERROR(ENOENT));

	if (tryconfig_txg == 0) {
		spa_load_failed(spa, "mmp: pool is active on remote host, "
		    "tryconfig_txg=%llu", (u_longlong_t)tryconfig_txg);
		return (SET_ERROR(EREMOTEIO));
	}

	error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
	    &tryconfig_timestamp);
	if (error)
		return (SET_ERROR(ENOENT));

	error = nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
	    &tryconfig_mmp_seq);
	if (error)
		return (SET_ERROR(ENOENT));

	/*
	 * The pool is idle only if the uberblock is unchanged (same
	 * timestamp, txg, and mmp sequence) since the tryimport.
	 */
	if (tryconfig_timestamp == ub->ub_timestamp &&
	    tryconfig_txg == ub->ub_txg &&
	    MMP_SEQ_VALID(ub) && tryconfig_mmp_seq == MMP_SEQ(ub)) {
		zfs_dbgmsg("mmp: verified pool mmp tryimport config, "
		    "spa=%s", spa_load_name(spa));
		return (0);
	}

	spa_load_failed(spa, "mmp: pool is active on remote host, "
	    "tc_timestamp=%llu ub_timestamp=%llu "
	    "tc_txg=%llu ub_txg=%llu tc_seq=%llu ub_seq=%llu",
	    (u_longlong_t)tryconfig_timestamp, (u_longlong_t)ub->ub_timestamp,
	    (u_longlong_t)tryconfig_txg, (u_longlong_t)ub->ub_txg,
	    (u_longlong_t)tryconfig_mmp_seq, (u_longlong_t)MMP_SEQ(ub));

	return (SET_ERROR(EREMOTEIO));
}
3863 
3864 /*
3865  * Determine whether the activity check is required.
3866  */
static boolean_t
spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label)
{
	nvlist_t *config = spa->spa_config;
	uint64_t state = POOL_STATE_ACTIVE;
	uint64_t hostid = 0;

	/*
	 * Disable the MMP activity check - This is used by zdb which
	 * is always read-only and intended to be used on potentially
	 * active pools.
	 */
	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) {
		zfs_dbgmsg("mmp: skipping check ZFS_IMPORT_SKIP_MMP is set, "
		    "spa=%s", spa_load_name(spa));
		return (B_FALSE);
	}

	/*
	 * Skip the activity check when the MMP feature is disabled.
	 * - MMP_MAGIC not set - Legacy pool predates the MMP feature, or
	 * - MMP_MAGIC set && mmp_delay == 0 - MMP feature is disabled.
	 */
	if ((ub->ub_mmp_magic != MMP_MAGIC) ||
	    (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)) {
		zfs_dbgmsg("mmp: skipping check: feature is disabled, "
		    "spa=%s", spa_load_name(spa));
		return (B_FALSE);
	}

	/*
	 * Allow the activity check to be skipped when importing a cleanly
	 * exported pool on the same host which last imported it.  Since the
	 * hostid from configuration may be stale use the one read from the
	 * label.  Imports from other hostids must perform the activity check.
	 */
	if (label != NULL) {
		if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
			hostid = fnvlist_lookup_uint64(label,
			    ZPOOL_CONFIG_HOSTID);

		if (nvlist_exists(config, ZPOOL_CONFIG_POOL_STATE))
			state = fnvlist_lookup_uint64(config,
			    ZPOOL_CONFIG_POOL_STATE);

		if (spa_get_hostid(spa) && hostid == spa_get_hostid(spa) &&
		    state == POOL_STATE_EXPORTED) {
			zfs_dbgmsg("mmp: skipping check: hostid matches "
			    "and pool is exported, spa=%s, hostid=%llx",
			    spa_load_name(spa), (u_longlong_t)hostid);
			return (B_FALSE);
		}

		/*
		 * A destroyed pool was deliberately taken out of service,
		 * so no remote host can legitimately still be using it.
		 */
		if (state == POOL_STATE_DESTROYED) {
			zfs_dbgmsg("mmp: skipping check: intentionally "
			    "destroyed pool, spa=%s", spa_load_name(spa));
			return (B_FALSE);
		}
	}

	/* Otherwise the activity check is required. */
	return (B_TRUE);
}
3929 
3930 /*
3931  * Nanoseconds the activity check must watch for changes on-disk.
3932  */
static uint64_t
spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
{
	/* Never watch for fewer than one multihost interval. */
	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
	uint64_t multihost_interval = MSEC2NSEC(
	    MMP_INTERVAL_OK(zfs_multihost_interval));
	/* Default from local tunables, floored at one second (NANOSEC). */
	uint64_t import_delay = MAX(NANOSEC, import_intervals *
	    multihost_interval);

	/*
	 * Local tunables determine a minimum duration except for the case
	 * where we know when the remote host will suspend the pool if MMP
	 * writes do not land.
	 *
	 * See Big Theory comment at the top of mmp.c for the reasoning behind
	 * these cases and times.
	 */

	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);

	if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
	    MMP_FAIL_INT(ub) > 0) {

		/* MMP on remote host will suspend pool after failed writes */
		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
		    MMP_IMPORT_SAFETY_FACTOR / 100;

		zfs_dbgmsg("mmp: settings spa=%s fail_intvals>0 "
		    "import_delay=%llu mmp_fails=%llu mmp_interval=%llu "
		    "import_intervals=%llu", spa_load_name(spa),
		    (u_longlong_t)import_delay,
		    (u_longlong_t)MMP_FAIL_INT(ub),
		    (u_longlong_t)MMP_INTERVAL(ub),
		    (u_longlong_t)import_intervals);

	} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
	    MMP_FAIL_INT(ub) == 0) {

		/* MMP on remote host will never suspend pool */
		import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
		    ub->ub_mmp_delay) * import_intervals);

		zfs_dbgmsg("mmp: settings spa=%s fail_intvals=0 "
		    "import_delay=%llu mmp_interval=%llu ub_mmp_delay=%llu "
		    "import_intervals=%llu", spa_load_name(spa),
		    (u_longlong_t)import_delay,
		    (u_longlong_t)MMP_INTERVAL(ub),
		    (u_longlong_t)ub->ub_mmp_delay,
		    (u_longlong_t)import_intervals);

	} else if (MMP_VALID(ub)) {
		/*
		 * zfs-0.7 compatibility case: the uberblock carries a
		 * valid mmp_delay but lacks the interval/fail fields.
		 */

		import_delay = MAX(import_delay, (multihost_interval +
		    ub->ub_mmp_delay) * import_intervals);

		zfs_dbgmsg("mmp: settings spa=%s import_delay=%llu "
		    "ub_mmp_delay=%llu import_intervals=%llu leaves=%u",
		    spa_load_name(spa), (u_longlong_t)import_delay,
		    (u_longlong_t)ub->ub_mmp_delay,
		    (u_longlong_t)import_intervals,
		    vdev_count_leaves(spa));
	} else {
		/* Using local tunings is the only reasonable option */
		zfs_dbgmsg("mmp: pool last imported on non-MMP aware "
		    "host using settings spa=%s import_delay=%llu "
		    "multihost_interval=%llu import_intervals=%llu",
		    spa_load_name(spa), (u_longlong_t)import_delay,
		    (u_longlong_t)multihost_interval,
		    (u_longlong_t)import_intervals);
	}

	return (import_delay);
}
4009 
4010 /*
4011  * Store the observed pool status in spa->spa_load_info nvlist.  If the
4012  * remote hostname or hostid are available from configuration read from
4013  * disk store them as well.  Additionally, provide some diagnostic info
4014  * for which activity checks were run and their duration.  This allows
4015  * 'zpool import' to generate a more useful message.
4016  *
4017  * Mandatory observed pool status
4018  * - ZPOOL_CONFIG_MMP_STATE        - observed pool status (active/inactive)
4019  * - ZPOOL_CONFIG_MMP_TXG          - observed pool txg number
4020  * - ZPOOL_CONFIG_MMP_SEQ          - observed pool sequence id
4021  *
4022  * Optional information for detailed reporting
4023  * - ZPOOL_CONFIG_MMP_HOSTNAME     - hostname from the active pool
4024  * - ZPOOL_CONFIG_MMP_HOSTID       - hostid from the active pool
4025  * - ZPOOL_CONFIG_MMP_RESULT	 - set to result of activity check
4026  * - ZPOOL_CONFIG_MMP_TRYIMPORT_NS - tryimport duration in nanosec
4027  * - ZPOOL_CONFIG_MMP_IMPORT_NS    - import duration in nanosec
4028  * - ZPOOL_CONFIG_MMP_CLAIM_NS     - claim duration in nanosec
4029  *
4030  * ZPOOL_CONFIG_MMP_RESULT can be set to:
4031  * - ENXIO	- system hostid not set
4032  * - ESRCH	- activity check skipped
4033  * - EREMOTEIO	- activity check detected active pool
4034  * - EINTR	- activity check interrupted
4035  * - 0		- activity check detected no activity
4036  */
static void
spa_activity_set_load_info(spa_t *spa, nvlist_t *label, mmp_state_t state,
    uint64_t txg, uint16_t seq, int error)
{
	mmp_thread_t *mmp = &spa->spa_mmp;
	const char *hostname = NULL;
	uint64_t hostid = 0;

	/* Always report a zero txg and seq id for active pools. */
	if (state == MMP_STATE_ACTIVE) {
		ASSERT0(txg);
		ASSERT0(seq);
	}

	/*
	 * Optional reporting details: include the remote hostname/hostid
	 * when the caller provided a label that carries them.
	 */
	if (label) {
		if (nvlist_exists(label, ZPOOL_CONFIG_HOSTNAME)) {
			hostname = fnvlist_lookup_string(label,
			    ZPOOL_CONFIG_HOSTNAME);
			fnvlist_add_string(spa->spa_load_info,
			    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
		}

		if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) {
			hostid = fnvlist_lookup_uint64(label,
			    ZPOOL_CONFIG_HOSTID);
			fnvlist_add_uint64(spa->spa_load_info,
			    ZPOOL_CONFIG_MMP_HOSTID, hostid);
		}
	}

	/* Mandatory observed pool status. */
	fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, state);
	fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, txg);
	fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, seq);
	fnvlist_add_uint32(spa->spa_load_info, ZPOOL_CONFIG_MMP_RESULT, error);

	/* Only durations which were actually measured (non-zero) are set. */
	if (mmp->mmp_tryimport_ns > 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_TRYIMPORT_NS, mmp->mmp_tryimport_ns);
	}

	if (mmp->mmp_import_ns > 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_IMPORT_NS, mmp->mmp_import_ns);
	}

	if (mmp->mmp_claim_ns > 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_CLAIM_NS, mmp->mmp_claim_ns);
	}

	zfs_dbgmsg("mmp: set spa_load_info, spa=%s hostname=%s hostid=%llx "
	    "state=%d txg=%llu seq=%llu tryimport_ns=%lld import_ns=%lld "
	    "claim_ns=%lld", spa_load_name(spa),
	    hostname != NULL ? hostname : "none", (u_longlong_t)hostid,
	    (int)state, (u_longlong_t)txg, (u_longlong_t)seq,
	    (longlong_t)mmp->mmp_tryimport_ns, (longlong_t)mmp->mmp_import_ns,
	    (longlong_t)mmp->mmp_claim_ns);
}
4095 
4096 static int
spa_ld_activity_result(spa_t * spa,int error,const char * state)4097 spa_ld_activity_result(spa_t *spa, int error, const char *state)
4098 {
4099 	switch (error) {
4100 	case ENXIO:
4101 		cmn_err(CE_WARN, "pool '%s' system hostid not set, "
4102 		    "aborted import during %s", spa_load_name(spa), state);
4103 		/* Userspace expects EREMOTEIO for no system hostid */
4104 		error = EREMOTEIO;
4105 		break;
4106 	case EREMOTEIO:
4107 		cmn_err(CE_WARN, "pool '%s' activity detected, aborted "
4108 		    "import during %s", spa_load_name(spa), state);
4109 		break;
4110 	case EINTR:
4111 		cmn_err(CE_WARN, "pool '%s' activity check, interrupted "
4112 		    "import during %s", spa_load_name(spa), state);
4113 		break;
4114 	case 0:
4115 		cmn_err(CE_NOTE, "pool '%s' activity check completed "
4116 		    "successfully", spa_load_name(spa));
4117 		break;
4118 	}
4119 
4120 	return (error);
4121 }
4122 
4123 
/*
 * Remote host activity check.  Performed during tryimport when the pool
 * has passed the basic sanity checks and is open read-only.
 *
 * error results:
 *          0 - no activity detected
 *  EREMOTEIO - remote activity detected
 *      EINTR - user canceled the operation
 */
static int
spa_activity_check_tryimport(spa_t *spa, uberblock_t *spa_ub,
    boolean_t importing)
{
	kcondvar_t cv;
	kmutex_t mtx;
	int error = 0;

	/* Local cv/mutex pair used only to sleep interruptibly below. */
	cv_init(&cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mtx);

	uint64_t import_delay = spa_activity_check_duration(spa, spa_ub);
	hrtime_t start_time = gethrtime();

	/* Add a small random factor in case of simultaneous imports (0-25%) */
	import_delay += import_delay * random_in_range(250) / 1000;
	hrtime_t import_expire = gethrtime() + import_delay;

	if (importing) {
		/* Console message includes tryimport and claim time */
		hrtime_t extra_delay = MMP_IMPORT_VERIFY_ITERS *
		    MSEC2NSEC(MMP_INTERVAL_VALID(spa_ub) ?
		    MMP_INTERVAL(spa_ub) : MMP_MIN_INTERVAL);
		cmn_err(CE_NOTE, "pool '%s' activity check required, "
		    "%llu seconds remaining", spa_load_name(spa),
		    (u_longlong_t)MAX(NSEC2SEC(import_delay + extra_delay), 1));
		spa_import_progress_set_notes(spa, "Checking MMP activity, "
		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
	}

	hrtime_t now;
	nvlist_t *mmp_label = NULL;

	/*
	 * Repeatedly reload the best uberblock until the watch window
	 * expires; any change relative to the caller's uberblock means a
	 * remote host is writing to the pool.
	 */
	while ((now = gethrtime()) < import_expire) {
		vdev_t *rvd = spa->spa_root_vdev;
		uberblock_t mmp_ub;

		if (importing) {
			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
			    NSEC2SEC(import_expire - gethrtime()));
		}

		vdev_uberblock_load(rvd, &mmp_ub, &mmp_label);

		if (vdev_uberblock_compare(spa_ub, &mmp_ub)) {
			spa_load_failed(spa, "mmp: activity detected during "
			    "tryimport, spa_ub_txg=%llu mmp_ub_txg=%llu "
			    "spa_ub_seq=%llu mmp_ub_seq=%llu "
			    "spa_ub_timestamp=%llu mmp_ub_timestamp=%llu "
			    "spa_ub_config=%#llx mmp_ub_config=%#llx",
			    (u_longlong_t)spa_ub->ub_txg,
			    (u_longlong_t)mmp_ub.ub_txg,
			    (u_longlong_t)(MMP_SEQ_VALID(spa_ub) ?
			    MMP_SEQ(spa_ub) : 0),
			    (u_longlong_t)(MMP_SEQ_VALID(&mmp_ub) ?
			    MMP_SEQ(&mmp_ub) : 0),
			    (u_longlong_t)spa_ub->ub_timestamp,
			    (u_longlong_t)mmp_ub.ub_timestamp,
			    (u_longlong_t)spa_ub->ub_mmp_config,
			    (u_longlong_t)mmp_ub.ub_mmp_config);
			error = SET_ERROR(EREMOTEIO);
			break;
		}

		if (mmp_label) {
			nvlist_free(mmp_label);
			mmp_label = NULL;
		}

		/*
		 * Sleep for roughly one second between reads.
		 * cv_timedwait_sig() returns -1 when the timeout expired;
		 * any other return means the wait was interrupted.
		 */
		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
		if (error != -1) {
			error = SET_ERROR(EINTR);
			break;
		}
		error = 0;
	}

	mutex_exit(&mtx);
	mutex_destroy(&mtx);
	cv_destroy(&cv);

	if (mmp_label)
		nvlist_free(mmp_label);

	/*
	 * Record how long the check took, stored under either the import
	 * or tryimport duration depending on the current load state, for
	 * later reporting in spa_load_info.
	 */
	if (spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_OPEN) {
		spa->spa_mmp.mmp_import_ns = gethrtime() - start_time;
	} else {
		spa->spa_mmp.mmp_tryimport_ns = gethrtime() - start_time;
	}

	return (error);
}
4227 
/*
 * Remote host activity check.  Performed during import when the pool has
 * passed most sanity checks and has been reopened read/write.
 *
 * error results:
 *          0 - no activity detected
 *  EREMOTEIO - remote activity detected
 *      EINTR - user canceled the operation
 */
static int
spa_activity_check_claim(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	nvlist_t *mmp_label;
	uberblock_t spa_ub;
	kcondvar_t cv;
	kmutex_t mtx;
	int error = 0;

	/* Local cv/mutex pair used only to sleep interruptibly below. */
	cv_init(&cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mtx);

	hrtime_t start_time = gethrtime();

	/*
	 * Load the best uberblock and verify it matches the uberblock already
	 * identified and stored as spa->spa_uberblock to verify the pool has
	 * not changed.
	 */
	vdev_uberblock_load(rvd, &spa_ub, &mmp_label);

	if (memcmp(&spa->spa_uberblock, &spa_ub, sizeof (uberblock_t))) {
		spa_load_failed(spa, "mmp: uberblock changed on disk");
		error = SET_ERROR(EREMOTEIO);
		goto out;
	}

	/* The claim protocol requires a fully populated MMP uberblock. */
	if (!MMP_VALID(&spa_ub) || !MMP_INTERVAL_VALID(&spa_ub) ||
	    !MMP_SEQ_VALID(&spa_ub) || !MMP_FAIL_INT_VALID(&spa_ub)) {
		spa_load_failed(spa, "mmp: is not enabled in spa uberblock");
		error = SET_ERROR(EREMOTEIO);
		goto out;
	}

	nvlist_free(mmp_label);
	mmp_label = NULL;

	uint64_t spa_ub_interval = MMP_INTERVAL(&spa_ub);
	uint16_t spa_ub_seq = MMP_SEQ(&spa_ub);

	/*
	 * In the highly unlikely event the sequence numbers have been
	 * exhausted reset the sequence to zero.  As long as the MMP
	 * uberblock is updated on all of the vdevs the activity will
	 * still be detected.
	 */
	if (MMP_SEQ_MAX == spa_ub_seq)
		spa_ub_seq = 0;

	spa_import_progress_set_notes(spa,
	    "Establishing MMP claim, waiting %llu ms",
	    (u_longlong_t)(MMP_IMPORT_VERIFY_ITERS * spa_ub_interval));

	/*
	 * Repeatedly sync out an MMP uberblock with a randomly selected
	 * sequence number, then read it back after the MMP interval.  This
	 * random value acts as a claim token and is visible on other hosts.
	 * If the same random value is read back we can be certain no other
	 * pool is attempting to import the pool.
	 */
	for (int i = MMP_IMPORT_VERIFY_ITERS; i > 0; i--) {
		uberblock_t set_ub, mmp_ub;
		uint16_t mmp_seq;

		(void) spa_import_progress_set_mmp_check(spa_guid(spa),
		    NSEC2SEC(i * MSEC2NSEC(spa_ub_interval)));

		/* Pick a random sequence id strictly above the current one. */
		set_ub = spa_ub;
		mmp_seq = spa_ub_seq + 1 +
		    random_in_range(MMP_SEQ_MAX - spa_ub_seq);
		MMP_SEQ_CLEAR(&set_ub);
		set_ub.ub_mmp_config |= MMP_SEQ_SET(mmp_seq);

		error = mmp_claim_uberblock(spa, rvd, &set_ub);
		if (error) {
			spa_load_failed(spa, "mmp: uberblock claim "
			    "failed, error=%d", error);
			error = SET_ERROR(EREMOTEIO);
			break;
		}

		/*
		 * cv_timedwait_sig() returns -1 when the timeout expired;
		 * any other return means the wait was interrupted.
		 */
		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() +
		    MSEC_TO_TICK(spa_ub_interval));
		if (error != -1) {
			error = SET_ERROR(EINTR);
			break;
		}

		vdev_uberblock_load(rvd, &mmp_ub, &mmp_label);

		if (vdev_uberblock_compare(&set_ub, &mmp_ub)) {
			spa_load_failed(spa, "mmp: activity detected during "
			    "claim, set_ub_txg=%llu mmp_ub_txg=%llu "
			    "set_ub_seq=%llu mmp_ub_seq=%llu "
			    "set_ub_timestamp=%llu mmp_ub_timestamp=%llu "
			    "set_ub_config=%#llx mmp_ub_config=%#llx",
			    (u_longlong_t)set_ub.ub_txg,
			    (u_longlong_t)mmp_ub.ub_txg,
			    (u_longlong_t)(MMP_SEQ_VALID(&set_ub) ?
			    MMP_SEQ(&set_ub) : 0),
			    (u_longlong_t)(MMP_SEQ_VALID(&mmp_ub) ?
			    MMP_SEQ(&mmp_ub) : 0),
			    (u_longlong_t)set_ub.ub_timestamp,
			    (u_longlong_t)mmp_ub.ub_timestamp,
			    (u_longlong_t)set_ub.ub_mmp_config,
			    (u_longlong_t)mmp_ub.ub_mmp_config);
			error = SET_ERROR(EREMOTEIO);
			break;
		}

		if (mmp_label) {
			nvlist_free(mmp_label);
			mmp_label = NULL;
		}

		error = 0;
	}
out:
	spa->spa_mmp.mmp_claim_ns = gethrtime() - start_time;
	(void) spa_import_progress_set_mmp_check(spa_guid(spa), 0);

	/*
	 * Any label still held here is passed along so the remote
	 * hostname/hostid can be included in spa_load_info.
	 */
	if (error == EREMOTEIO) {
		spa_activity_set_load_info(spa, mmp_label,
		    MMP_STATE_ACTIVE, 0, 0, EREMOTEIO);
	} else {
		spa_activity_set_load_info(spa, mmp_label,
		    MMP_STATE_INACTIVE, spa_ub.ub_txg, MMP_SEQ(&spa_ub), 0);
	}

	/*
	 * Restore the original sequence, this allows us to retry the
	 * import procedure if a subsequent step fails during import.
	 * Failure to restore it reduces the available sequence ids for
	 * the next import but shouldn't be considered fatal.
	 */
	int restore_error = mmp_claim_uberblock(spa, rvd, &spa_ub);
	if (restore_error) {
		zfs_dbgmsg("mmp: uberblock restore failed, spa=%s error=%d",
		    spa_load_name(spa), restore_error);
	}

	if (mmp_label)
		nvlist_free(mmp_label);

	mutex_exit(&mtx);
	mutex_destroy(&mtx);
	cv_destroy(&cv);

	return (error);
}
4389 
/*
 * Dispatch the appropriate MMP activity check for the current load state.
 * Returns 0 when the pool may be imported; a spa_vdev_err() on failure.
 */
static int
spa_ld_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *label)
{
	vdev_t *rvd = spa->spa_root_vdev;
	int error;

	/*
	 * The MMP feature is enabled on-disk but this node has no system
	 * hostid set, so it cannot participate in multihost protection;
	 * refuse the import.
	 */
	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
	    spa_get_hostid(spa) == 0) {
		spa_activity_set_load_info(spa, label, MMP_STATE_NO_HOSTID,
		    ub->ub_txg, MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0, ENXIO);
		zfs_dbgmsg("mmp: system hostid not set, ub_mmp_magic=%llx "
		    "ub_mmp_delay=%llu hostid=%llx",
		    (u_longlong_t)ub->ub_mmp_magic,
		    (u_longlong_t)ub->ub_mmp_delay,
		    (u_longlong_t)spa_get_hostid(spa));
		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, ENXIO));
	}

	switch (spa->spa_load_state) {
	case SPA_LOAD_TRYIMPORT:
tryimport:
		/* Full on-disk watch; also reached via goto from below. */
		error = spa_activity_check_tryimport(spa, ub, B_TRUE);
		if (error == EREMOTEIO) {
			spa_activity_set_load_info(spa, label,
			    MMP_STATE_ACTIVE, 0, 0, EREMOTEIO);
			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
		} else if (error) {
			ASSERT3S(error, ==, EINTR);
			spa_activity_set_load_info(spa, label,
			    MMP_STATE_ACTIVE, 0, 0, EINTR);
			return (error);
		}

		spa_activity_set_load_info(spa, label, MMP_STATE_INACTIVE,
		    ub->ub_txg, MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0, 0);

		break;

	case SPA_LOAD_IMPORT:
	case SPA_LOAD_OPEN:
		/*
		 * Reuse the activity check results recorded by an earlier
		 * tryimport when available; when the config lacks them
		 * (ENOENT) fall back to a full tryimport-style check.
		 */
		error = spa_activity_verify_config(spa, ub);
		if (error == EREMOTEIO) {
			spa_activity_set_load_info(spa, label,
			    MMP_STATE_ACTIVE, 0, 0, EREMOTEIO);
			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
		} else if (error) {
			ASSERT3S(error, ==, ENOENT);
			goto tryimport;
		}

		/* Load info set in spa_activity_check_claim() */

		break;

	case SPA_LOAD_RECOVER:
		zfs_dbgmsg("mmp: skipping mmp check for rewind, spa=%s",
		    spa_load_name(spa));
		break;

	default:
		/* Unexpected load state; fail safe as if active remotely. */
		spa_activity_set_load_info(spa, label, MMP_STATE_ACTIVE,
		    0, 0, EREMOTEIO);
		zfs_dbgmsg("mmp: unreachable, spa=%s spa_load_state=%d",
		    spa_load_name(spa), spa->spa_load_state);
		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
	}

	return (0);
}
4459 
4460 /*
4461  * Called from zfs_ioc_clear for a pool that was suspended
4462  * after failing mmp write checks.
4463  */
4464 boolean_t
spa_mmp_remote_host_activity(spa_t * spa)4465 spa_mmp_remote_host_activity(spa_t *spa)
4466 {
4467 	ASSERT(spa_multihost(spa) && spa_suspended(spa));
4468 
4469 	nvlist_t *best_label;
4470 	uberblock_t best_ub;
4471 
4472 	/*
4473 	 * Locate the best uberblock on disk
4474 	 */
4475 	vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
4476 	if (best_label) {
4477 		/*
4478 		 * confirm that the best hostid matches our hostid
4479 		 */
4480 		if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
4481 		    spa_get_hostid(spa) !=
4482 		    fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
4483 			nvlist_free(best_label);
4484 			return (B_TRUE);
4485 		}
4486 		nvlist_free(best_label);
4487 	} else {
4488 		return (B_TRUE);
4489 	}
4490 
4491 	if (!MMP_VALID(&best_ub) ||
4492 	    !MMP_FAIL_INT_VALID(&best_ub) ||
4493 	    MMP_FAIL_INT(&best_ub) == 0) {
4494 		return (B_TRUE);
4495 	}
4496 
4497 	if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
4498 	    best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
4499 		zfs_dbgmsg("mmp: txg mismatch detected during pool clear, "
4500 		    "spa=%s txg=%llu ub_txg=%llu timestamp=%llu "
4501 		    "ub_timestamp=%llu", spa_name(spa),
4502 		    (u_longlong_t)spa->spa_uberblock.ub_txg,
4503 		    (u_longlong_t)best_ub.ub_txg,
4504 		    (u_longlong_t)spa->spa_uberblock.ub_timestamp,
4505 		    (u_longlong_t)best_ub.ub_timestamp);
4506 		return (B_TRUE);
4507 	}
4508 
4509 	/*
4510 	 * Perform an activity check looking for any remote writer
4511 	 */
4512 	return (spa_activity_check_tryimport(spa, &best_ub, B_FALSE) != 0);
4513 }
4514 
4515 static int
spa_verify_host(spa_t * spa,nvlist_t * mos_config)4516 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
4517 {
4518 	uint64_t hostid;
4519 	const char *hostname;
4520 	uint64_t myhostid = 0;
4521 
4522 	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
4523 	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
4524 		hostname = fnvlist_lookup_string(mos_config,
4525 		    ZPOOL_CONFIG_HOSTNAME);
4526 
4527 		myhostid = zone_get_hostid(NULL);
4528 
4529 		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
4530 			cmn_err(CE_WARN, "pool '%s' could not be "
4531 			    "loaded as it was last accessed by "
4532 			    "another system (host: %s hostid: 0x%llx). "
4533 			    "See: https://openzfs.github.io/openzfs-docs/msg/"
4534 			    "ZFS-8000-EY",
4535 			    spa_name(spa), hostname, (u_longlong_t)hostid);
4536 			spa_load_failed(spa, "hostid verification failed: pool "
4537 			    "last accessed by host: %s (hostid: 0x%llx)",
4538 			    hostname, (u_longlong_t)hostid);
4539 			return (SET_ERROR(EBADF));
4540 		}
4541 	}
4542 
4543 	return (0);
4544 }
4545 
/*
 * Validate the basic contents of the config provided to spa_load()
 * (pool guid, vdev tree) and construct the in-core vdev tree from it.
 */
static int
spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
{
	int error = 0;
	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
	int parse;
	vdev_t *rvd;
	uint64_t pool_guid;
	const char *comment;
	const char *compatibility;

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		spa_load_failed(spa, "invalid config provided: '%s' missing",
		    ZPOOL_CONFIG_POOL_GUID);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * If we are doing an import, ensure that the pool is not already
	 * imported by checking if its pool guid already exists in the
	 * spa namespace.
	 *
	 * The only case that we allow an already imported pool to be
	 * imported again, is when the pool is checkpointed and we want to
	 * look at its checkpointed state from userland tools like zdb.
	 */
#ifdef _KERNEL
	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
#else
	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0) &&
	    !spa_importing_readonly_checkpoint(spa)) {
#endif
		spa_load_failed(spa, "a pool with guid %llu is already open",
		    (u_longlong_t)pool_guid);
		return (SET_ERROR(EEXIST));
	}

	spa->spa_config_guid = pool_guid;

	/* Start from a fresh load_info for this load attempt. */
	nvlist_free(spa->spa_load_info);
	spa->spa_load_info = fnvlist_alloc();

	ASSERT0P(spa->spa_comment);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		spa->spa_comment = spa_strdup(comment);

	ASSERT0P(spa->spa_compatibility);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
	    &compatibility) == 0)
		spa->spa_compatibility = spa_strdup(compatibility);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
		spa->spa_config_splitting = fnvlist_dup(nvl);

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
		spa_load_failed(spa, "invalid config provided: '%s' missing",
		    ZPOOL_CONFIG_VDEV_TREE);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Create "The Godfather" zio to hold all async IOs
	 */
	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
	    KM_SLEEP);
	for (int i = 0; i < max_ncpus; i++) {
		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);
	}

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	parse = (type == SPA_IMPORT_EXISTING ?
	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (error != 0) {
		spa_load_failed(spa, "unable to parse config [error=%d]",
		    error);
		return (error);
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);

	if (type != SPA_IMPORT_ASSEMBLE) {
		ASSERT(spa_guid(spa) == pool_guid);
	}

	return (0);
}
4659 
4660 /*
4661  * Recursively open all vdevs in the vdev tree. This function is called twice:
4662  * first with the untrusted config, then with the trusted config.
4663  */
static int
spa_ld_open_vdevs(spa_t *spa)
{
	int error = 0;

	/*
	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
	 * missing/unopenable for the root vdev to be still considered openable.
	 */
	if (spa->spa_trust_config) {
		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
	} else {
		spa->spa_missing_tvds_allowed = 0;
	}

	/*
	 * The global tunable acts as a floor regardless of the config
	 * source selected above.
	 */
	spa->spa_missing_tvds_allowed =
	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	error = vdev_open(spa->spa_root_vdev);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (spa->spa_missing_tvds != 0) {
		spa_load_note(spa, "vdev tree has %lld missing top-level "
		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
		if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
			/*
			 * Although theoretically we could allow users to open
			 * incomplete pools in RW mode, we'd need to add a lot
			 * of extra logic (e.g. adjust pool space to account
			 * for missing vdevs).
			 * This limitation also prevents users from accidentally
			 * opening the pool in RW mode during data recovery and
			 * damaging it further.
			 */
			spa_load_note(spa, "pools with missing top-level "
			    "vdevs can only be opened in read-only mode.");
			error = SET_ERROR(ENXIO);
		} else {
			spa_load_note(spa, "current settings allow for maximum "
			    "%lld missing top-level vdevs at this stage.",
			    (u_longlong_t)spa->spa_missing_tvds_allowed);
		}
	}
	if (error != 0) {
		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
		    error);
	}
	/* Dump the tree for debugging when anything went wrong. */
	if (spa->spa_missing_tvds != 0 || error != 0)
		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);

	return (error);
}
4721 
4722 /*
4723  * We need to validate the vdev labels against the configuration that
4724  * we have in hand. This function is called twice: first with an untrusted
4725  * config, then with a trusted config. The validation is more strict when the
4726  * config is trusted.
4727  */
4728 static int
4729 spa_ld_validate_vdevs(spa_t *spa)
4730 {
4731 	int error = 0;
4732 	vdev_t *rvd = spa->spa_root_vdev;
4733 
4734 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4735 	error = vdev_validate(rvd);
4736 	spa_config_exit(spa, SCL_ALL, FTAG);
4737 
4738 	if (error != 0) {
4739 		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
4740 		return (error);
4741 	}
4742 
4743 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
4744 		spa_load_failed(spa, "cannot open vdev tree after invalidating "
4745 		    "some vdevs");
4746 		vdev_dbgmsg_print_tree(rvd, 2);
4747 		return (SET_ERROR(ENXIO));
4748 	}
4749 
4750 	return (0);
4751 }
4752 
/*
 * Finish uberblock selection: mark the pool active and derive the txg
 * bookkeeping used by the rest of the load path.
 *
 * NOTE(review): the assignment order appears intentional — spa_ubsync is
 * set from spa_uberblock before spa_last_synced_txg() is consulted below,
 * which presumably reads spa_ubsync; do not reorder.
 */
static void
spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
{
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	/*
	 * With extreme rewind, verify everything since the beginning of
	 * time; otherwise only txgs newer than the deferred-free window.
	 */
	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
	/* If we rewound, resume from the txg we rewound to. */
	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
	spa->spa_claim_max_txg = spa->spa_first_txg;
	spa->spa_prev_software_version = ub->ub_software_version;
}
4765 
/*
 * Select the uberblock to load the pool from and record it in
 * spa->spa_uberblock.  Normally this is the best uberblock found in the
 * vdev labels; when opening a checkpointed pool read-only, the
 * checkpoint uberblock already set by spa_ld_checkpoint_rewind is kept
 * instead.  Along the way this validates the pool version and the
 * features-for-read advertised in the label, runs the multihost (MMP)
 * activity check, finishes an interrupted pool split, and finally
 * initializes in-core txg state via spa_ld_select_uberblock_done().
 */
static int
spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
{
	vdev_t *rvd = spa->spa_root_vdev;
	nvlist_t *label;
	uberblock_t *ub = &spa->spa_uberblock;

	/*
	 * If we are opening the checkpointed state of the pool by
	 * rewinding to it, at this point we will have written the
	 * checkpointed uberblock to the vdev labels, so searching
	 * the labels will find the right uberblock.  However, if
	 * we are opening the checkpointed state read-only, we have
	 * not modified the labels. Therefore, we must ignore the
	 * labels and continue using the spa_uberblock that was set
	 * by spa_ld_checkpoint_rewind.
	 *
	 * Note that it would be fine to ignore the labels when
	 * rewinding (opening writeable) as well. However, if we
	 * crash just after writing the labels, we will end up
	 * searching the labels. Doing so in the common case means
	 * that this code path gets exercised normally, rather than
	 * just in the edge case.
	 */
	if (ub->ub_checkpoint_txg != 0 &&
	    spa_importing_readonly_checkpoint(spa)) {
		spa_ld_select_uberblock_done(spa, ub);
		return (0);
	}

	/*
	 * Find the best uberblock.  'label' receives the label config
	 * associated with it (consumed and freed below).
	 */
	vdev_uberblock_load(rvd, ub, &label);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		nvlist_free(label);
		spa_load_failed(spa, "no valid uberblock found");
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
	}

	if (spa->spa_load_max_txg != UINT64_MAX) {
		(void) spa_import_progress_set_max_txg(spa_guid(spa),
		    (u_longlong_t)spa->spa_load_max_txg);
	}
	spa_load_note(spa, "using uberblock with txg=%llu",
	    (u_longlong_t)ub->ub_txg);
	if (ub->ub_raidz_reflow_info != 0) {
		spa_load_note(spa, "uberblock raidz_reflow_info: "
		    "state=%u offset=%llu",
		    (int)RRSS_GET_STATE(ub),
		    (u_longlong_t)RRSS_GET_OFFSET(ub));
	}

	/*
	 * For pools which have the multihost property on determine if the
	 * pool is truly inactive and can be safely imported.  Prevent
	 * hosts which don't have a hostid set from importing the pool.
	 */
	spa->spa_activity_check = spa_activity_check_required(spa, ub, label);
	if (spa->spa_activity_check) {
		int error = spa_ld_activity_check(spa, ub, label);
		if (error) {
			spa_load_state_t state = spa->spa_load_state;
			error = spa_ld_activity_result(spa, error,
			    state == SPA_LOAD_TRYIMPORT ? "tryimport" :
			    state == SPA_LOAD_IMPORT ? "import" : "open");
			nvlist_free(label);
			return (error);
		}
	} else {
		/* No activity check required; record ESRCH as MMP result. */
		fnvlist_add_uint32(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_RESULT, ESRCH);
	}

	/*
	 * If the pool has an unsupported version we can't open it.
	 */
	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
		nvlist_free(label);
		spa_load_failed(spa, "version %llu is not supported",
		    (u_longlong_t)ub->ub_version);
		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
	}

	if (ub->ub_version >= SPA_VERSION_FEATURES) {
		nvlist_t *features;

		/*
		 * If we weren't able to find what's necessary for reading the
		 * MOS in the label, return failure.
		 */
		if (label == NULL) {
			spa_load_failed(spa, "label config unavailable");
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    ENXIO));
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
		    &features) != 0) {
			nvlist_free(label);
			spa_load_failed(spa, "invalid label: '%s' missing",
			    ZPOOL_CONFIG_FEATURES_FOR_READ);
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
			    ENXIO));
		}

		/*
		 * Update our in-core representation with the definitive values
		 * from the label.
		 */
		nvlist_free(spa->spa_label_features);
		spa->spa_label_features = fnvlist_dup(features);
	}

	nvlist_free(label);

	/*
	 * Look through entries in the label nvlist's features_for_read. If
	 * there is a feature listed there which we don't understand then we
	 * cannot open a pool.  (The label itself was freed above; we walk
	 * the copy saved in spa_label_features.)
	 */
	if (ub->ub_version >= SPA_VERSION_FEATURES) {
		nvlist_t *unsup_feat;

		unsup_feat = fnvlist_alloc();

		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
		    NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
			if (!zfeature_is_supported(nvpair_name(nvp))) {
				fnvlist_add_string(unsup_feat,
				    nvpair_name(nvp), "");
			}
		}

		if (!nvlist_empty(unsup_feat)) {
			/* Pass the unsupported list to userland. */
			fnvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
			nvlist_free(unsup_feat);
			spa_load_failed(spa, "some features are unsupported");
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
			    ENOTSUP));
		}

		nvlist_free(unsup_feat);
	}

	/*
	 * If a device split was underway (spa_config_splitting is set),
	 * attempt to repair it and discard the splitting state.
	 */
	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_try_repair(spa, spa->spa_config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		nvlist_free(spa->spa_config_splitting);
		spa->spa_config_splitting = NULL;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa_ld_select_uberblock_done(spa, ub);

	return (0);
}
4932 
4933 static int
4934 spa_ld_open_rootbp(spa_t *spa)
4935 {
4936 	int error = 0;
4937 	vdev_t *rvd = spa->spa_root_vdev;
4938 
4939 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
4940 	if (error != 0) {
4941 		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
4942 		    "[error=%d]", error);
4943 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4944 	}
4945 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
4946 
4947 	return (0);
4948 }
4949 
/*
 * Replace the (possibly untrusted) config the pool was opened with by
 * the trusted copy stored in the MOS (DMU_POOL_CONFIG):
 *
 *  - parse a new vdev tree from the MOS config and copy the fresher
 *    vdev paths from the provided tree into it,
 *  - swap it in as spa_root_vdev, regenerate spa_config from the MOS
 *    and mark the config trusted,
 *  - re-open and re-validate the vdevs, then run the remaining sanity
 *    checks (uberblock freshness, MMP claim, missing logs, guid sum).
 *
 * Returns EAGAIN when the caller should restart the load using the MOS
 * config just installed; 'reloading' guards against doing that twice.
 */
static int
spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
    boolean_t reloading)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv, *mos_config, *policy;
	int error = 0, copy_error;
	uint64_t healthy_tvds, healthy_tvds_mos;
	uint64_t mos_config_txg;

	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
	    != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * If we're assembling a pool from a split, the config provided is
	 * already trusted so there is nothing to do.
	 */
	if (type == SPA_IMPORT_ASSEMBLE)
		return (0);

	/* Counted now so it can be compared against the MOS tree later. */
	healthy_tvds = spa_healthy_core_tvds(spa);

	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
	    != 0) {
		spa_load_failed(spa, "unable to retrieve MOS config");
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * If we are doing an open, pool owner wasn't verified yet, thus do
	 * the verification here.
	 */
	if (spa->spa_load_state == SPA_LOAD_OPEN) {
		error = spa_verify_host(spa, mos_config);
		if (error != 0) {
			nvlist_free(mos_config);
			return (error);
		}
	}

	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Build a new vdev tree from the trusted config
	 */
	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
	if (error != 0) {
		nvlist_free(mos_config);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
		    error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	}

	/*
	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
	 * obtained by scanning /dev/dsk, then it will have the right vdev
	 * paths. We update the trusted MOS config with this information.
	 * We first try to copy the paths with vdev_copy_path_strict, which
	 * succeeds only when both configs have exactly the same vdev tree.
	 * If that fails, we fall back to a more flexible method that has a
	 * best effort policy.
	 */
	copy_error = vdev_copy_path_strict(rvd, mrvd);
	if (copy_error != 0 || spa_load_print_vdev_tree) {
		spa_load_note(spa, "provided vdev tree:");
		vdev_dbgmsg_print_tree(rvd, 2);
		spa_load_note(spa, "MOS vdev tree:");
		vdev_dbgmsg_print_tree(mrvd, 2);
	}
	if (copy_error != 0) {
		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
		    "back to vdev_copy_path_relaxed");
		vdev_copy_path_relaxed(rvd, mrvd);
	}

	/* Retire the provided tree and install the MOS-derived one. */
	vdev_close(rvd);
	vdev_free(rvd);
	spa->spa_root_vdev = mrvd;
	rvd = mrvd;
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * If 'zpool import' used a cached config, then the on-disk hostid and
	 * hostname may be different to the cached config in ways that should
	 * prevent import.  Userspace can't discover this without a scan, but
	 * we know, so we add these values to LOAD_INFO so the caller can know
	 * the difference.
	 *
	 * Note that we have to do this before the config is regenerated,
	 * because the new config will have the hostid and hostname for this
	 * host, in readiness for import.
	 */
	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
		    fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
		fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
		    fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));

	/*
	 * We will use spa_config if we decide to reload the spa or if spa_load
	 * fails and we rewind. We must thus regenerate the config using the
	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
	 * pass settings on how to load the pool and is not stored in the MOS.
	 * We copy it over to our new, trusted config.
	 */
	mos_config_txg = fnvlist_lookup_uint64(mos_config,
	    ZPOOL_CONFIG_POOL_TXG);
	nvlist_free(mos_config);
	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
	    &policy) == 0)
		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
	spa_config_set(spa, mos_config);
	spa->spa_config_source = SPA_CONFIG_SRC_MOS;

	/*
	 * Now that we got the config from the MOS, we should be more strict
	 * in checking blkptrs and can make assumptions about the consistency
	 * of the vdev tree. spa_trust_config must be set to true before opening
	 * vdevs in order for them to be writeable.
	 */
	spa->spa_trust_config = B_TRUE;

	/*
	 * Open and validate the new vdev tree
	 */
	error = spa_ld_open_vdevs(spa);
	if (error != 0)
		return (error);

	error = spa_ld_validate_vdevs(spa);
	if (error != 0)
		return (error);

	if (copy_error != 0 || spa_load_print_vdev_tree) {
		spa_load_note(spa, "final vdev tree:");
		vdev_dbgmsg_print_tree(rvd, 2);
	}

	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
		/*
		 * Sanity check to make sure that we are indeed loading the
		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
		 * in the config provided and they happened to be the only ones
		 * to have the latest uberblock, we could involuntarily perform
		 * an extreme rewind.
		 */
		healthy_tvds_mos = spa_healthy_core_tvds(spa);
		if (healthy_tvds_mos - healthy_tvds >=
		    SPA_SYNC_MIN_VDEVS) {
			spa_load_note(spa, "config provided misses too many "
			    "top-level vdevs compared to MOS (%lld vs %lld). ",
			    (u_longlong_t)healthy_tvds,
			    (u_longlong_t)healthy_tvds_mos);
			spa_load_note(spa, "vdev tree:");
			vdev_dbgmsg_print_tree(rvd, 2);
			if (reloading) {
				spa_load_failed(spa, "config was already "
				    "provided from MOS. Aborting.");
				return (spa_vdev_err(rvd,
				    VDEV_AUX_CORRUPT_DATA, EIO));
			}
			spa_load_note(spa, "spa must be reloaded using MOS "
			    "config");
			return (SET_ERROR(EAGAIN));
		}
	}

	/*
	 * Final sanity check for multihost pools that no other host is
	 * accessing the pool.  All of the read-only checks have passed at
	 * this point, perform targeted updates to the mmp uberblocks to
	 * safely force a visible change.
	 */
	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
	    !spa->spa_extreme_rewind && spa->spa_activity_check) {

		error = spa_activity_check_claim(spa);
		error = spa_ld_activity_result(spa, error, "claim");

		if (error == EREMOTEIO)
			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
		else if (error)
			return (error);
	}

	error = spa_check_for_missing_logs(spa);
	if (error != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));

	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
		    "guid sum (%llu != %llu)",
		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
		    (u_longlong_t)rvd->vdev_guid_sum);
		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
		    ENXIO));
	}

	return (0);
}
5157 
5158 static int
5159 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
5160 {
5161 	int error = 0;
5162 	vdev_t *rvd = spa->spa_root_vdev;
5163 
5164 	/*
5165 	 * Everything that we read before spa_remove_init() must be stored
5166 	 * on concreted vdevs.  Therefore we do this as early as possible.
5167 	 */
5168 	error = spa_remove_init(spa);
5169 	if (error != 0) {
5170 		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
5171 		    error);
5172 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5173 	}
5174 
5175 	/*
5176 	 * Retrieve information needed to condense indirect vdev mappings.
5177 	 */
5178 	error = spa_condense_init(spa);
5179 	if (error != 0) {
5180 		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
5181 		    error);
5182 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
5183 	}
5184 
5185 	return (0);
5186 }
5187 
/*
 * Verify that this host supports all of the features the pool requires.
 * Features needed for read are checked unconditionally; features needed
 * for write are additionally checked when the pool is writable or when
 * we are in tryimport (so userland can report whether the pool could at
 * least be imported read-only).  *missing_feat_writep is set when a
 * for-write feature is unsupported.  On success this also seeds the
 * in-core feature refcount cache and flags the
 * encryption-without-bookmark_v2 errata.
 */
static int
spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
		boolean_t missing_feat_read = B_FALSE;
		nvlist_t *unsup_feat, *enabled_feat;

		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
		}

		/*
		 * Accumulate the enabled and unsupported feature lists so
		 * they can be handed to userland via spa_load_info.
		 */
		enabled_feat = fnvlist_alloc();
		unsup_feat = fnvlist_alloc();

		if (!spa_features_check(spa, B_FALSE,
		    unsup_feat, enabled_feat))
			missing_feat_read = B_TRUE;

		if (spa_writeable(spa) ||
		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
			if (!spa_features_check(spa, B_TRUE,
			    unsup_feat, enabled_feat)) {
				*missing_feat_writep = B_TRUE;
			}
		}

		fnvlist_add_nvlist(spa->spa_load_info,
		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);

		if (!nvlist_empty(unsup_feat)) {
			fnvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
		}

		fnvlist_free(enabled_feat);
		fnvlist_free(unsup_feat);

		if (!missing_feat_read) {
			fnvlist_add_boolean(spa->spa_load_info,
			    ZPOOL_CONFIG_CAN_RDONLY);
		}

		/*
		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
		 * twofold: to determine whether the pool is available for
		 * import in read-write mode and (if it is not) whether the
		 * pool is available for import in read-only mode. If the pool
		 * is available for import in read-write mode, it is displayed
		 * as available in userland; if it is not available for import
		 * in read-only mode, it is displayed as unavailable in
		 * userland. If the pool is available for import in read-only
		 * mode but not read-write mode, it is displayed as unavailable
		 * in userland with a special note that the pool is actually
		 * available for open in read-only mode.
		 *
		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
		 * missing a feature for write, we must first determine whether
		 * the pool can be opened read-only before returning to
		 * userland in order to know whether to display the
		 * abovementioned note.
		 */
		if (missing_feat_read || (*missing_feat_writep &&
		    spa_writeable(spa))) {
			spa_load_failed(spa, "pool uses unsupported features");
			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
			    ENOTSUP));
		}

		/*
		 * Load refcounts for ZFS features from disk into an in-memory
		 * cache during SPA initialization.
		 */
		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
			uint64_t refcount;

			error = feature_get_refcount_from_disk(spa,
			    &spa_feature_table[i], &refcount);
			if (error == 0) {
				spa->spa_feat_refcount_cache[i] = refcount;
			} else if (error == ENOTSUP) {
				/* Treat ENOTSUP as "feature disabled". */
				spa->spa_feat_refcount_cache[i] =
				    SPA_FEATURE_DISABLED;
			} else {
				spa_load_failed(spa, "error getting refcount "
				    "for feature %s [error=%d]",
				    spa_feature_table[i].fi_guid, error);
				return (spa_vdev_err(rvd,
				    VDEV_AUX_CORRUPT_DATA, EIO));
			}
		}
	}

	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * Encryption was added before bookmark_v2, even though bookmark_v2
	 * is now a dependency. If this pool has encryption enabled without
	 * bookmark_v2, trigger an errata message.
	 */
	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
	}

	return (0);
}
5312 
5313 static int
5314 spa_ld_load_special_directories(spa_t *spa)
5315 {
5316 	int error = 0;
5317 	vdev_t *rvd = spa->spa_root_vdev;
5318 
5319 	spa->spa_is_initializing = B_TRUE;
5320 	error = dsl_pool_open(spa->spa_dsl_pool);
5321 	spa->spa_is_initializing = B_FALSE;
5322 	if (error != 0) {
5323 		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
5324 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5325 	}
5326 
5327 	return (0);
5328 }
5329 
/*
 * Read pool-wide properties and bookkeeping objects out of the MOS: the
 * checksum salt, deferred-free bpobj, deflate flag, txg time log,
 * persistent error logs, livelist deletions, history object, per-vdev
 * ZAP map and the pool properties object.  Most entries are optional
 * (ENOENT is tolerated) because older pools predate them.
 */
static int
spa_ld_get_props(spa_t *spa)
{
	int error = 0;
	uint64_t obj;
	vdev_t *rvd = spa->spa_root_vdev;

	/* Grab the checksum salt from the MOS. */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT, 1,
	    sizeof (spa->spa_cksum_salt.zcs_bytes),
	    spa->spa_cksum_salt.zcs_bytes);
	if (error == ENOENT) {
		/* Generate a new salt for subsequent use */
		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
		    sizeof (spa->spa_cksum_salt.zcs_bytes));
	} else if (error != 0) {
		spa_load_failed(spa, "unable to retrieve checksum salt from "
		    "MOS [error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/* The deferred-frees bpobj is mandatory. */
	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
	if (error != 0) {
		spa_load_failed(spa, "error opening deferred-frees bpobj "
		    "[error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
	    &spa->spa_creation_version, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/* Load time log */
	spa_load_txg_log_time(spa);

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
	    B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
	    &spa->spa_errlog_scrub, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/* Load the last scrubbed txg. */
	error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG,
	    &spa->spa_scrubbed_last_txg, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the livelist deletion field. If a livelist is queued for
	 * deletion, indicate that in the spa
	 */
	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
	    &spa->spa_livelists_to_delete, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
	if (error != 0 && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	/*
	 * Load the per-vdev ZAP map. If we have an older pool, this will not
	 * be present; in this case, defer its creation to a later time to
	 * avoid dirtying the MOS this early / out of sync context. See
	 * spa_sync_config_object.
	 */

	/* The sentinel is only available in the MOS config. */
	nvlist_t *mos_config;
	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
		spa_load_failed(spa, "unable to retrieve MOS config");
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	}

	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
	    &spa->spa_all_vdev_zaps, B_FALSE);

	if (error == ENOENT) {
		VERIFY(!nvlist_exists(mos_config,
		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
	} else if (error != 0) {
		nvlist_free(mos_config);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
		/*
		 * An older version of ZFS overwrote the sentinel value, so
		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
		 * destruction to later; see spa_sync_config_object.
		 */
		spa->spa_avz_action = AVZ_ACTION_DESTROY;
		/*
		 * We're assuming that no vdevs have had their ZAPs created
		 * before this. Better be sure of it.
		 */
		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
	}
	nvlist_free(mos_config);

	/* Default until the props object (if any) says otherwise. */
	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
	    B_FALSE);
	if (error && error != ENOENT)
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));

	if (error == 0) {
		uint64_t autoreplace = 0;

		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
		spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
		    &spa->spa_dedup_table_quota);
		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
		spa->spa_autoreplace = (autoreplace != 0);
	}

	/*
	 * If we are importing a pool with missing top-level vdevs,
	 * we enforce that the pool doesn't panic or get suspended on
	 * error since the likelihood of missing data is extremely high.
	 */
	if (spa->spa_missing_tvds > 0 &&
	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
		spa_load_note(spa, "forcing failmode to 'continue' "
		    "as some top level vdevs are missing");
		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
	}

	return (0);
}
5492 
5493 static int
5494 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
5495 {
5496 	int error = 0;
5497 	vdev_t *rvd = spa->spa_root_vdev;
5498 
5499 	/*
5500 	 * If we're assembling the pool from the split-off vdevs of
5501 	 * an existing pool, we don't want to attach the spares & cache
5502 	 * devices.
5503 	 */
5504 
5505 	/*
5506 	 * Load any hot spares for this pool.
5507 	 */
5508 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
5509 	    B_FALSE);
5510 	if (error != 0 && error != ENOENT)
5511 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5512 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
5513 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
5514 		if (load_nvlist(spa, spa->spa_spares.sav_object,
5515 		    &spa->spa_spares.sav_config) != 0) {
5516 			spa_load_failed(spa, "error loading spares nvlist");
5517 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5518 		}
5519 
5520 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5521 		spa_load_spares(spa);
5522 		spa_config_exit(spa, SCL_ALL, FTAG);
5523 	} else if (error == 0) {
5524 		spa->spa_spares.sav_sync = B_TRUE;
5525 	}
5526 
5527 	/*
5528 	 * Load any level 2 ARC devices for this pool.
5529 	 */
5530 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
5531 	    &spa->spa_l2cache.sav_object, B_FALSE);
5532 	if (error != 0 && error != ENOENT)
5533 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5534 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
5535 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
5536 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
5537 		    &spa->spa_l2cache.sav_config) != 0) {
5538 			spa_load_failed(spa, "error loading l2cache nvlist");
5539 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5540 		}
5541 
5542 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5543 		spa_load_l2cache(spa);
5544 		spa_config_exit(spa, SCL_ALL, FTAG);
5545 	} else if (error == 0) {
5546 		spa->spa_l2cache.sav_sync = B_TRUE;
5547 	}
5548 
5549 	return (0);
5550 }
5551 
/*
 * Load per-vdev metadata (metaslabs, DTLs, spacemap objects, log
 * spacemaps) and propagate the loaded DTLs up the vdev tree.  Before
 * that, enforce the multihost hostid requirement and, if autoreplace is
 * set, post the notifications for unopenable devices.
 */
static int
spa_ld_load_vdev_metadata(spa_t *spa)
{
	int error = 0;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * If the 'multihost' property is set, then never allow a pool to
	 * be imported when the system hostid is zero.  The exception to
	 * this rule is zdb which is always allowed to access pools.
	 */
	if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
		fnvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
		spa_check_removed(spa->spa_root_vdev);
		/*
		 * For the import case, this is done in spa_import(), because
		 * at this point we're using the spare definitions from
		 * the MOS config, not necessarily from the userland config.
		 */
		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
			spa_aux_check_removed(&spa->spa_spares);
			spa_aux_check_removed(&spa->spa_l2cache);
		}
	}

	/*
	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
	 */
	error = vdev_load(rvd);
	if (error != 0) {
		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	}

	error = spa_ld_log_spacemaps(spa);
	if (error != 0) {
		spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
		    error);
		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
	}

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
	 */
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	return (0);
}
5615 
5616 static int
5617 spa_ld_load_dedup_tables(spa_t *spa)
5618 {
5619 	int error = 0;
5620 	vdev_t *rvd = spa->spa_root_vdev;
5621 
5622 	error = ddt_load(spa);
5623 	if (error != 0) {
5624 		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
5625 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5626 	}
5627 
5628 	return (0);
5629 }
5630 
5631 static int
5632 spa_ld_load_brt(spa_t *spa)
5633 {
5634 	int error = 0;
5635 	vdev_t *rvd = spa->spa_root_vdev;
5636 
5637 	error = brt_load(spa);
5638 	if (error != 0) {
5639 		spa_load_failed(spa, "brt_load failed [error=%d]", error);
5640 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
5641 	}
5642 
5643 	return (0);
5644 }
5645 
5646 static int
5647 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
5648 {
5649 	vdev_t *rvd = spa->spa_root_vdev;
5650 
5651 	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
5652 		boolean_t missing = spa_check_logs(spa);
5653 		if (missing) {
5654 			if (spa->spa_missing_tvds != 0) {
5655 				spa_load_note(spa, "spa_check_logs failed "
5656 				    "so dropping the logs");
5657 			} else {
5658 				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
5659 				spa_load_failed(spa, "spa_check_logs failed");
5660 				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
5661 				    ENXIO));
5662 			}
5663 		}
5664 	}
5665 
5666 	return (0);
5667 }
5668 
5669 static int
5670 spa_ld_verify_pool_data(spa_t *spa)
5671 {
5672 	int error = 0;
5673 	vdev_t *rvd = spa->spa_root_vdev;
5674 
5675 	/*
5676 	 * We've successfully opened the pool, verify that we're ready
5677 	 * to start pushing transactions.
5678 	 */
5679 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
5680 		error = spa_load_verify(spa);
5681 		if (error != 0) {
5682 			spa_load_failed(spa, "spa_load_verify failed "
5683 			    "[error=%d]", error);
5684 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
5685 			    error));
5686 		}
5687 	}
5688 
5689 	return (0);
5690 }
5691 
/*
 * Claim all uncommitted intent-log blocks in a single txg so the pool
 * owns them before the first sync.  spa_claim_max_txg is updated as a
 * side effect (via spa_claim_notify() from the zio done callbacks), and
 * the log state is marked good once claiming completes.
 */
static void
spa_ld_claim_log_blocks(spa_t *spa)
{
	dmu_tx_t *tx;
	dsl_pool_t *dp = spa_get_dsl(spa);

	/*
	 * Claim log blocks that haven't been committed yet.
	 * This must all happen in a single txg.
	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
	 * invoked from zil_claim_log_block()'s i/o done callback.
	 * Price of rollback is that we abandon the log.
	 */
	spa->spa_claiming = B_TRUE;

	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    zil_claim, tx, DS_FIND_CHILDREN);
	dmu_tx_commit(tx);

	spa->spa_claiming = B_FALSE;

	spa_set_log_state(spa, SPA_LOG_GOOD);
}
5716 
5717 static void
5718 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
5719     boolean_t update_config_cache)
5720 {
5721 	vdev_t *rvd = spa->spa_root_vdev;
5722 	int need_update = B_FALSE;
5723 
5724 	/*
5725 	 * If the config cache is stale, or we have uninitialized
5726 	 * metaslabs (see spa_vdev_add()), then update the config.
5727 	 *
5728 	 * If this is a verbatim import, trust the current
5729 	 * in-core spa_config and update the disk labels.
5730 	 */
5731 	if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
5732 	    spa->spa_load_state == SPA_LOAD_IMPORT ||
5733 	    spa->spa_load_state == SPA_LOAD_RECOVER ||
5734 	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
5735 		need_update = B_TRUE;
5736 
5737 	for (int c = 0; c < rvd->vdev_children; c++)
5738 		if (rvd->vdev_child[c]->vdev_ms_array == 0)
5739 			need_update = B_TRUE;
5740 
5741 	/*
5742 	 * Update the config cache asynchronously in case we're the
5743 	 * root pool, in which case the config cache isn't writable yet.
5744 	 */
5745 	if (need_update)
5746 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
5747 }
5748 
5749 static void
5750 spa_ld_prepare_for_reload(spa_t *spa)
5751 {
5752 	spa_mode_t mode = spa->spa_mode;
5753 	int async_suspended = spa->spa_async_suspended;
5754 
5755 	spa_unload(spa);
5756 	spa_deactivate(spa);
5757 	spa_activate(spa, mode);
5758 
5759 	/*
5760 	 * We save the value of spa_async_suspended as it gets reset to 0 by
5761 	 * spa_unload(). We want to restore it back to the original value before
5762 	 * returning as we might be calling spa_async_resume() later.
5763 	 */
5764 	spa->spa_async_suspended = async_suspended;
5765 }
5766 
/*
 * Look up the checkpointed uberblock in the MOS, if any, and record its
 * txg and timestamp in the spa.  A missing checkpoint (ENOENT) is not
 * an error; any other lookup failure is returned to the caller.
 */
static int
spa_ld_read_checkpoint_txg(spa_t *spa)
{
	uberblock_t checkpoint;
	int error = 0;

	ASSERT0(spa->spa_checkpoint_txg);
	/* Called either with the namespace lock or from the load thread. */
	ASSERT(spa_namespace_held() ||
	    spa->spa_load_thread == curthread);

	/* The checkpoint is stored as a whole uberblock in a ZAP entry. */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);

	if (error == ENOENT)
		return (0);

	if (error != 0)
		return (error);

	ASSERT3U(checkpoint.ub_txg, !=, 0);
	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
	spa->spa_checkpoint_txg = checkpoint.ub_txg;
	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;

	return (0);
}
5795 
5796 static int
5797 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
5798 {
5799 	int error = 0;
5800 
5801 	ASSERT(spa_namespace_held());
5802 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
5803 
5804 	/*
5805 	 * Never trust the config that is provided unless we are assembling
5806 	 * a pool following a split.
5807 	 * This means don't trust blkptrs and the vdev tree in general. This
5808 	 * also effectively puts the spa in read-only mode since
5809 	 * spa_writeable() checks for spa_trust_config to be true.
5810 	 * We will later load a trusted config from the MOS.
5811 	 */
5812 	if (type != SPA_IMPORT_ASSEMBLE)
5813 		spa->spa_trust_config = B_FALSE;
5814 
5815 	/*
5816 	 * Parse the config provided to create a vdev tree.
5817 	 */
5818 	error = spa_ld_parse_config(spa, type);
5819 	if (error != 0)
5820 		return (error);
5821 
5822 	spa_import_progress_add(spa);
5823 
5824 	/*
5825 	 * Now that we have the vdev tree, try to open each vdev. This involves
5826 	 * opening the underlying physical device, retrieving its geometry and
5827 	 * probing the vdev with a dummy I/O. The state of each vdev will be set
5828 	 * based on the success of those operations. After this we'll be ready
5829 	 * to read from the vdevs.
5830 	 */
5831 	error = spa_ld_open_vdevs(spa);
5832 	if (error != 0)
5833 		return (error);
5834 
5835 	/*
5836 	 * Read the label of each vdev and make sure that the GUIDs stored
5837 	 * there match the GUIDs in the config provided.
5838 	 * If we're assembling a new pool that's been split off from an
5839 	 * existing pool, the labels haven't yet been updated so we skip
5840 	 * validation for now.
5841 	 */
5842 	if (type != SPA_IMPORT_ASSEMBLE) {
5843 		error = spa_ld_validate_vdevs(spa);
5844 		if (error != 0)
5845 			return (error);
5846 	}
5847 
5848 	/*
5849 	 * Read all vdev labels to find the best uberblock (i.e. latest,
5850 	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
5851 	 * get the list of features required to read blkptrs in the MOS from
5852 	 * the vdev label with the best uberblock and verify that our version
5853 	 * of zfs supports them all.
5854 	 */
5855 	error = spa_ld_select_uberblock(spa, type);
5856 	if (error != 0)
5857 		return (error);
5858 
5859 	/*
5860 	 * Pass that uberblock to the dsl_pool layer which will open the root
5861 	 * blkptr. This blkptr points to the latest version of the MOS and will
5862 	 * allow us to read its contents.
5863 	 */
5864 	error = spa_ld_open_rootbp(spa);
5865 	if (error != 0)
5866 		return (error);
5867 
5868 	return (0);
5869 }
5870 
/*
 * Replace the pool's current uberblock with the checkpointed uberblock
 * stored in the MOS.  If the pool is writeable, the updated uberblock
 * is also synced to the vdev labels, making the rewind permanent.
 * Returns ZFS_ERR_NO_CHECKPOINT if the pool has no checkpoint.
 */
static int
spa_ld_checkpoint_rewind(spa_t *spa)
{
	uberblock_t checkpoint;
	int error = 0;

	ASSERT(spa_namespace_held());
	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);

	if (error != 0) {
		spa_load_failed(spa, "unable to retrieve checkpointed "
		    "uberblock from the MOS config [error=%d]", error);

		if (error == ENOENT)
			error = ZFS_ERR_NO_CHECKPOINT;

		return (error);
	}

	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);

	/*
	 * We need to update the txg and timestamp of the checkpointed
	 * uberblock to be higher than the latest one. This ensures that
	 * the checkpointed uberblock is selected if we were to close and
	 * reopen the pool right after we've written it in the vdev labels.
	 * (also see block comment in vdev_uberblock_compare)
	 */
	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
	checkpoint.ub_timestamp = gethrestime_sec();

	/*
	 * Set current uberblock to be the checkpointed uberblock.
	 */
	spa->spa_uberblock = checkpoint;

	/*
	 * If we are doing a normal rewind, then the pool is open for
	 * writing and we sync the "updated" checkpointed uberblock to
	 * disk. Once this is done, we've basically rewound the whole
	 * pool and there is no way back.
	 *
	 * There are cases when we don't want to attempt and sync the
	 * checkpointed uberblock to disk because we are opening a
	 * pool as read-only. Specifically, verifying the checkpointed
	 * state with zdb, and importing the checkpointed state to get
	 * a "preview" of its content.
	 */
	if (spa_writeable(spa)) {
		vdev_t *rvd = spa->spa_root_vdev;

		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
		int svdcount = 0;
		int children = rvd->vdev_children;
		int c0 = random_in_range(children);

		/*
		 * Pick up to SPA_SYNC_MIN_VDEVS concrete, non-log
		 * top-level vdevs (starting at a random child) to carry
		 * the label update.
		 */
		for (int c = 0; c < children; c++) {
			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];

			/* Stop when revisiting the first vdev */
			if (c > 0 && svd[0] == vd)
				break;

			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
			    !vdev_is_concrete(vd))
				continue;

			svd[svdcount++] = vd;
			if (svdcount == SPA_SYNC_MIN_VDEVS)
				break;
		}
		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;
		spa_config_exit(spa, SCL_ALL, FTAG);

		if (error != 0) {
			spa_load_failed(spa, "failed to write checkpointed "
			    "uberblock to the vdev labels [error=%d]", error);
			return (error);
		}
	}

	return (0);
}
5962 
5963 static int
5964 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
5965     boolean_t *update_config_cache)
5966 {
5967 	int error;
5968 
5969 	/*
5970 	 * Parse the config for pool, open and validate vdevs,
5971 	 * select an uberblock, and use that uberblock to open
5972 	 * the MOS.
5973 	 */
5974 	error = spa_ld_mos_init(spa, type);
5975 	if (error != 0)
5976 		return (error);
5977 
5978 	/*
5979 	 * Retrieve the trusted config stored in the MOS and use it to create
5980 	 * a new, exact version of the vdev tree, then reopen all vdevs.
5981 	 */
5982 	error = spa_ld_trusted_config(spa, type, B_FALSE);
5983 	if (error == EAGAIN) {
5984 		if (update_config_cache != NULL)
5985 			*update_config_cache = B_TRUE;
5986 
5987 		/*
5988 		 * Redo the loading process with the trusted config if it is
5989 		 * too different from the untrusted config.
5990 		 */
5991 		spa_ld_prepare_for_reload(spa);
5992 		spa_load_note(spa, "RELOADING");
5993 		error = spa_ld_mos_init(spa, type);
5994 		if (error != 0)
5995 			return (error);
5996 
5997 		error = spa_ld_trusted_config(spa, type, B_TRUE);
5998 		if (error != 0)
5999 			return (error);
6000 
6001 	} else if (error != 0) {
6002 		return (error);
6003 	}
6004 
6005 	return (0);
6006 }
6007 
6008 /*
6009  * Load an existing storage pool, using the config provided. This config
6010  * describes which vdevs are part of the pool and is later validated against
6011  * partial configs present in each vdev's label and an entire copy of the
6012  * config stored in the MOS.
6013  */
static int
spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
{
	int error = 0;
	boolean_t missing_feat_write = B_FALSE;
	boolean_t checkpoint_rewind =
	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
	boolean_t update_config_cache = B_FALSE;
	/* Load start time; consumed by zio_handle_import_delay() below. */
	hrtime_t load_start = gethrtime();

	ASSERT(spa_namespace_held());
	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);

	spa_load_note(spa, "LOADING");

	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
	if (error != 0)
		return (error);

	/*
	 * If we are rewinding to the checkpoint then we need to repeat
	 * everything we've done so far in this function but this time
	 * selecting the checkpointed uberblock and using that to open
	 * the MOS.
	 */
	if (checkpoint_rewind) {
		/*
		 * If we are rewinding to the checkpoint update config cache
		 * anyway.
		 */
		update_config_cache = B_TRUE;

		/*
		 * Extract the checkpointed uberblock from the current MOS
		 * and use this as the pool's uberblock from now on. If the
		 * pool is imported as writeable we also write the checkpoint
		 * uberblock to the labels, making the rewind permanent.
		 */
		error = spa_ld_checkpoint_rewind(spa);
		if (error != 0)
			return (error);

		/*
		 * Redo the loading process again with the
		 * checkpointed uberblock.
		 */
		spa_ld_prepare_for_reload(spa);
		spa_load_note(spa, "LOADING checkpointed uberblock");
		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
		if (error != 0)
			return (error);
	}

	/*
	 * Drop the namespace lock for the rest of the function.
	 * spa_load_thread marks this thread as the loader so other code
	 * can tell the pool is mid-load; the lock is re-taken at 'fail:'.
	 */
	spa->spa_load_thread = curthread;
	spa_namespace_exit(FTAG);

	/*
	 * Retrieve the checkpoint txg if the pool has a checkpoint.
	 */
	spa_import_progress_set_notes(spa, "Loading checkpoint txg");
	error = spa_ld_read_checkpoint_txg(spa);
	if (error != 0)
		goto fail;

	/*
	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
	 * from the pool and their contents were re-mapped to other vdevs. Note
	 * that everything that we read before this step must have been
	 * rewritten on concrete vdevs after the last device removal was
	 * initiated. Otherwise we could be reading from indirect vdevs before
	 * we have loaded their mappings.
	 */
	spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
	error = spa_ld_open_indirect_vdev_metadata(spa);
	if (error != 0)
		goto fail;

	/*
	 * Retrieve the full list of active features from the MOS and check if
	 * they are all supported.
	 */
	spa_import_progress_set_notes(spa, "Checking feature flags");
	error = spa_ld_check_features(spa, &missing_feat_write);
	if (error != 0)
		goto fail;

	/*
	 * Load several special directories from the MOS needed by the dsl_pool
	 * layer.
	 */
	spa_import_progress_set_notes(spa, "Loading special MOS directories");
	error = spa_ld_load_special_directories(spa);
	if (error != 0)
		goto fail;

	/*
	 * Retrieve pool properties from the MOS.
	 */
	spa_import_progress_set_notes(spa, "Loading properties");
	error = spa_ld_get_props(spa);
	if (error != 0)
		goto fail;

	/*
	 * Retrieve the list of auxiliary devices - cache devices and spares -
	 * and open them.
	 */
	spa_import_progress_set_notes(spa, "Loading AUX vdevs");
	error = spa_ld_open_aux_vdevs(spa, type);
	if (error != 0)
		goto fail;

	/*
	 * Load the metadata for all vdevs. Also check if unopenable devices
	 * should be autoreplaced.
	 */
	spa_import_progress_set_notes(spa, "Loading vdev metadata");
	error = spa_ld_load_vdev_metadata(spa);
	if (error != 0)
		goto fail;

	spa_import_progress_set_notes(spa, "Loading dedup tables");
	error = spa_ld_load_dedup_tables(spa);
	if (error != 0)
		goto fail;

	spa_import_progress_set_notes(spa, "Loading BRT");
	error = spa_ld_load_brt(spa);
	if (error != 0)
		goto fail;

	/*
	 * Verify the logs now to make sure we don't have any unexpected errors
	 * when we claim log blocks later.
	 */
	spa_import_progress_set_notes(spa, "Verifying Log Devices");
	error = spa_ld_verify_logs(spa, type, ereport);
	if (error != 0)
		goto fail;

	if (missing_feat_write) {
		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);

		/*
		 * At this point, we know that we can open the pool in
		 * read-only mode but not read-write mode. We now have enough
		 * information and can return to userland.
		 */
		error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
		    ENOTSUP);
		goto fail;
	}

	/*
	 * Traverse the last txgs to make sure the pool was left off in a safe
	 * state. When performing an extreme rewind, we verify the whole pool,
	 * which can take a very long time.
	 */
	spa_import_progress_set_notes(spa, "Verifying pool data");
	error = spa_ld_verify_pool_data(spa);
	if (error != 0)
		goto fail;

	/*
	 * Calculate the deflated space for the pool. This must be done before
	 * we write anything to the pool because we'd need to update the space
	 * accounting using the deflated sizes.
	 */
	spa_import_progress_set_notes(spa, "Calculating deflated space");
	spa_update_dspace(spa);

	/*
	 * We have now retrieved all the information we needed to open the
	 * pool. If we are importing the pool in read-write mode, a few
	 * additional steps must be performed to finish the import.
	 */
	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		uint64_t config_cache_txg = spa->spa_config_txg;

		spa_import_progress_set_notes(spa, "Starting import");

		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);

		/*
		 * Before we do any zio_write's, complete the raidz expansion
		 * scratch space copying, if necessary.
		 */
		if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
			vdev_raidz_reflow_copy_scratch(spa);

		/*
		 * In case of a checkpoint rewind, log the original txg
		 * of the checkpointed uberblock.
		 */
		if (checkpoint_rewind) {
			spa_history_log_internal(spa, "checkpoint rewind",
			    NULL, "rewound state to txg=%llu",
			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
		}

		spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
		/*
		 * Traverse the ZIL and claim all blocks.
		 */
		spa_ld_claim_log_blocks(spa);

		/*
		 * Kick-off the syncing thread.
		 */
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);
		mmp_thread_start(spa);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by ZIL traversal operations
		 * performed above.
		 */
		spa_import_progress_set_notes(spa, "Syncing ZIL claims");
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * Check if we need to request an update of the config. On the
		 * next sync, we would update the config stored in vdev labels
		 * and the cachefile (by default /etc/zfs/zpool.cache).
		 */
		spa_import_progress_set_notes(spa, "Updating configs");
		spa_ld_check_for_config_update(spa, config_cache_txg,
		    update_config_cache);

		/*
		 * Check if a rebuild was in progress and if so resume it.
		 * Then check all DTLs to see if anything needs resilvering.
		 * The resilver will be deferred if a rebuild was started.
		 */
		spa_import_progress_set_notes(spa, "Starting resilvers");
		if (vdev_rebuild_active(spa->spa_root_vdev)) {
			vdev_rebuild_restart(spa);
		} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
			spa_async_request(spa, SPA_ASYNC_RESILVER);
		}

		/*
		 * Log the fact that we booted up (so that we can detect if
		 * we rebooted in the middle of an operation).
		 */
		spa_history_log_version(spa, "open", NULL);

		spa_import_progress_set_notes(spa,
		    "Restarting device removals");
		spa_restart_removal(spa);
		spa_spawn_aux_threads(spa);

		/*
		 * Delete any inconsistent datasets.
		 *
		 * Note:
		 * Since we may be issuing deletes for clones here,
		 * we make sure to do so after we've spawned all the
		 * auxiliary threads above (from which the livelist
		 * deletion zthr is part of).
		 */
		spa_import_progress_set_notes(spa,
		    "Cleaning up inconsistent objsets");
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
		 */
		spa_import_progress_set_notes(spa,
		    "Cleaning up temporary userrefs");
		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_import_progress_set_notes(spa, "Restarting initialize");
		vdev_initialize_restart(spa->spa_root_vdev);
		spa_import_progress_set_notes(spa, "Restarting TRIM");
		vdev_trim_restart(spa->spa_root_vdev);
		vdev_autotrim_restart(spa);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_import_progress_set_notes(spa, "Finished importing");
	}
	zio_handle_import_delay(spa, gethrtime() - load_start);

	spa_import_progress_remove(spa_guid(spa));
	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);

	spa_load_note(spa, "LOADED");
	/*
	 * Common exit path (success or failure): re-take the namespace
	 * lock dropped above and wake anyone waiting on the load thread.
	 */
fail:
	spa_namespace_enter(FTAG);
	spa->spa_load_thread = NULL;
	spa_namespace_broadcast();

	return (error);

}
6318 
/*
 * Unload the pool and retry the load with spa_load_max_txg capped just
 * below the previously selected uberblock's txg, forcing the next
 * attempt to pick an earlier uberblock (one rewind step).
 */
static int
spa_load_retry(spa_t *spa, spa_load_state_t state)
{
	spa_mode_t mode = spa->spa_mode;

	spa_unload(spa);
	spa_deactivate(spa);

	/* Only consider uberblocks older than the one that just failed. */
	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;

	spa_activate(spa, mode);
	spa_async_suspend(spa);

	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
	    (u_longlong_t)spa->spa_load_max_txg);

	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
}
6337 
6338 /*
6339  * If spa_load() fails this function will try loading prior txg's. If
6340  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
6341  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
6342  * function will not rewind the pool and will return the same error as
6343  * spa_load().
6344  */
static int
spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
    int rewind_flags)
{
	nvlist_t *loadinfo = NULL;
	nvlist_t *config = NULL;
	int load_error, rewind_error;
	uint64_t safe_rewind_txg;
	uint64_t min_txg;

	/*
	 * Pick the starting txg cap: an explicit recovery txg wins;
	 * otherwise any cap below UINT64_MAX implies an extreme rewind.
	 */
	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
		spa->spa_load_max_txg = spa->spa_load_txg;
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		spa->spa_load_max_txg = max_request;
		if (max_request != UINT64_MAX)
			spa->spa_extreme_rewind = B_TRUE;
	}

	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
	if (load_error == 0)
		return (0);

	/* Do not attempt to load uberblocks from previous txgs when: */
	switch (load_error) {
	case ZFS_ERR_NO_CHECKPOINT:
		/* Attempting checkpoint-rewind on a pool with no checkpoint */
		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
		zfs_fallthrough;
	case EREMOTEIO:
		/* MMP determines the pool is active on another host */
		zfs_fallthrough;
	case EBADF:
		/* The config cache is out of sync (vdevs or hostid) */
		zfs_fallthrough;
	case EINTR:
		/* The user interactively interrupted the import */
		spa_import_progress_remove(spa_guid(spa));
		return (load_error);
	}

	/*
	 * Preserve the failing config so it can be reinstalled if the
	 * rewind also fails (see spa_config_set() below).
	 */
	if (spa->spa_root_vdev != NULL)
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;

	if (rewind_flags & ZPOOL_NEVER_REWIND) {
		nvlist_free(config);
		spa_import_progress_remove(spa_guid(spa));
		return (load_error);
	}

	if (state == SPA_LOAD_RECOVER) {
		/* Price of rolling back is discarding txgs, including log */
		spa_set_log_state(spa, SPA_LOG_CLEAR);
	} else {
		/*
		 * If we aren't rolling back save the load info from our first
		 * import attempt so that we can restore it after attempting
		 * to rewind.
		 */
		loadinfo = spa->spa_load_info;
		spa->spa_load_info = fnvlist_alloc();
	}

	/*
	 * Rewinding more than TXG_DEFER_SIZE txgs back from the last
	 * synced uberblock is only attempted in extreme-rewind mode.
	 */
	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
	    TXG_INITIAL : safe_rewind_txg;

	/*
	 * Continue as long as we're finding errors, we're still within
	 * the acceptable rewind range, and we're still finding uberblocks
	 */
	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
		if (spa->spa_load_max_txg < safe_rewind_txg)
			spa->spa_extreme_rewind = B_TRUE;
		rewind_error = spa_load_retry(spa, state);
	}

	spa->spa_extreme_rewind = B_FALSE;
	spa->spa_load_max_txg = UINT64_MAX;

	/* Reinstall the saved config unless the rewind recovered the pool. */
	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
		spa_config_set(spa, config);
	else
		nvlist_free(config);

	if (state == SPA_LOAD_RECOVER) {
		/* loadinfo is only set aside in the non-RECOVER branch. */
		ASSERT0P(loadinfo);
		spa_import_progress_remove(spa_guid(spa));
		return (rewind_error);
	} else {
		/* Store the rewind info as part of the initial load info */
		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
		    spa->spa_load_info);

		/* Restore the initial load info */
		fnvlist_free(spa->spa_load_info);
		spa->spa_load_info = loadinfo;

		spa_import_progress_remove(spa_guid(spa));
		return (load_error);
	}
}
6452 
6453 /*
6454  * Pool Open/Import
6455  *
6456  * The import case is identical to an open except that the configuration is sent
6457  * down from userland, instead of grabbed from the configuration cache.  For the
6458  * case of an open, the pool configuration will exist in the
6459  * POOL_STATE_UNINITIALIZED state.
6460  *
6461  * The stats information (gen/count/ustats) is used to gather vdev statistics at
6462  * the same time open the pool, without having to keep around the spa_t in some
6463  * ambiguous state.
6464  */
/*
 * Open or import the named pool, loading it first if it is still in the
 * POOL_STATE_UNINITIALIZED state.  On success, a reference is taken on
 * the spa_t (returned through spapp) and, if config is non-NULL, a
 * freshly generated config nvlist is returned as well.  nvpolicy
 * optionally supplies a load/rewind policy; when NULL, the policy is
 * taken from the cached config.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, const void *tag,
    nvlist_t *nvpolicy, nvlist_t **config)
{
	spa_t *spa;
	spa_load_state_t state = SPA_LOAD_OPEN;
	int error;
	int locked = B_FALSE;
	int firstopen = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (!spa_namespace_held()) {
		spa_namespace_enter(FTAG);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			spa_namespace_exit(FTAG);
		return (SET_ERROR(ENOENT));
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		zpool_load_policy_t policy;

		firstopen = B_TRUE;

		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;

		zfs_dbgmsg("spa_open_common: opening %s", pool);
		error = spa_load_best(spa, state, policy.zlp_txg,
		    policy.zlp_rewind);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
			spa_remove(spa);
			if (locked)
				spa_namespace_exit(FTAG);
			return (SET_ERROR(ENOENT));
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config) {
				*config = fnvlist_dup(spa->spa_config);
				fnvlist_add_nvlist(*config,
				    ZPOOL_CONFIG_LOAD_INFO,
				    spa->spa_load_info);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				spa_namespace_exit(FTAG);
			*spapp = NULL;
			return (error);
		}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * If we've recovered the pool, pass back any information we
	 * gathered while doing the load.
	 */
	if (state == SPA_LOAD_RECOVER && config != NULL) {
		fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info);
	}

	/* Successful open: clear the failure/rewind bookkeeping. */
	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		spa_namespace_exit(FTAG);
	}

	if (firstopen)
		zvol_create_minors(spa_name(spa));

	*spapp = spa;

	return (0);
}
6581 
6582 int
6583 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
6584     nvlist_t *policy, nvlist_t **config)
6585 {
6586 	return (spa_open_common(name, spapp, tag, policy, config));
6587 }
6588 
6589 int
6590 spa_open(const char *name, spa_t **spapp, const void *tag)
6591 {
6592 	return (spa_open_common(name, spapp, tag, NULL, NULL));
6593 }
6594 
6595 /*
6596  * Lookup the given spa_t, incrementing the inject count in the process,
6597  * preventing it from being exported or destroyed.
6598  */
6599 spa_t *
6600 spa_inject_addref(char *name)
6601 {
6602 	spa_t *spa;
6603 
6604 	spa_namespace_enter(FTAG);
6605 	if ((spa = spa_lookup(name)) == NULL) {
6606 		spa_namespace_exit(FTAG);
6607 		return (NULL);
6608 	}
6609 	spa->spa_inject_ref++;
6610 	spa_namespace_exit(FTAG);
6611 
6612 	return (spa);
6613 }
6614 
/*
 * Release a reference taken by spa_inject_addref(), allowing the pool to
 * be exported or destroyed again once the count drops to zero.
 */
void
spa_inject_delref(spa_t *spa)
{
	spa_namespace_enter(FTAG);
	spa->spa_inject_ref--;
	spa_namespace_exit(FTAG);
}
6622 
6623 /*
6624  * Add spares device information to the nvlist.
6625  */
6626 static void
6627 spa_add_spares(spa_t *spa, nvlist_t *config)
6628 {
6629 	nvlist_t **spares;
6630 	uint_t i, nspares;
6631 	nvlist_t *nvroot;
6632 	uint64_t guid;
6633 	vdev_stat_t *vs;
6634 	uint_t vsc;
6635 	uint64_t pool;
6636 
6637 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
6638 
6639 	if (spa->spa_spares.sav_count == 0)
6640 		return;
6641 
6642 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
6643 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
6644 	    ZPOOL_CONFIG_SPARES, &spares, &nspares));
6645 	if (nspares != 0) {
6646 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6647 		    (const nvlist_t * const *)spares, nspares);
6648 		VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6649 		    &spares, &nspares));
6650 
6651 		/*
6652 		 * Go through and find any spares which have since been
6653 		 * repurposed as an active spare.  If this is the case, update
6654 		 * their status appropriately.
6655 		 */
6656 		for (i = 0; i < nspares; i++) {
6657 			guid = fnvlist_lookup_uint64(spares[i],
6658 			    ZPOOL_CONFIG_GUID);
6659 			VERIFY0(nvlist_lookup_uint64_array(spares[i],
6660 			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
6661 			if (spa_spare_exists(guid, &pool, NULL) &&
6662 			    pool != 0ULL) {
6663 				vs->vs_state = VDEV_STATE_CANT_OPEN;
6664 				vs->vs_aux = VDEV_AUX_SPARED;
6665 			} else {
6666 				vs->vs_state =
6667 				    spa->spa_spares.sav_vdevs[i]->vdev_state;
6668 			}
6669 		}
6670 	}
6671 }
6672 
6673 /*
6674  * Add l2cache device information to the nvlist, including vdev stats.
6675  */
6676 static void
6677 spa_add_l2cache(spa_t *spa, nvlist_t *config)
6678 {
6679 	nvlist_t **l2cache;
6680 	uint_t i, j, nl2cache;
6681 	nvlist_t *nvroot;
6682 	uint64_t guid;
6683 	vdev_t *vd;
6684 	vdev_stat_t *vs;
6685 	uint_t vsc;
6686 
6687 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
6688 
6689 	if (spa->spa_l2cache.sav_count == 0)
6690 		return;
6691 
6692 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
6693 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
6694 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
6695 	if (nl2cache != 0) {
6696 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6697 		    (const nvlist_t * const *)l2cache, nl2cache);
6698 		VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6699 		    &l2cache, &nl2cache));
6700 
6701 		/*
6702 		 * Update level 2 cache device stats.
6703 		 */
6704 
6705 		for (i = 0; i < nl2cache; i++) {
6706 			guid = fnvlist_lookup_uint64(l2cache[i],
6707 			    ZPOOL_CONFIG_GUID);
6708 
6709 			vd = NULL;
6710 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
6711 				if (guid ==
6712 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
6713 					vd = spa->spa_l2cache.sav_vdevs[j];
6714 					break;
6715 				}
6716 			}
6717 			ASSERT(vd != NULL);
6718 
6719 			VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
6720 			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
6721 			vdev_get_stats(vd, vs);
6722 			vdev_config_generate_stats(vd, l2cache[i]);
6723 
6724 		}
6725 	}
6726 }
6727 
6728 static void
6729 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
6730 {
6731 	zap_cursor_t zc;
6732 	zap_attribute_t *za = zap_attribute_alloc();
6733 
6734 	if (spa->spa_feat_for_read_obj != 0) {
6735 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
6736 		    spa->spa_feat_for_read_obj);
6737 		    zap_cursor_retrieve(&zc, za) == 0;
6738 		    zap_cursor_advance(&zc)) {
6739 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
6740 			    za->za_num_integers == 1);
6741 			VERIFY0(nvlist_add_uint64(features, za->za_name,
6742 			    za->za_first_integer));
6743 		}
6744 		zap_cursor_fini(&zc);
6745 	}
6746 
6747 	if (spa->spa_feat_for_write_obj != 0) {
6748 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
6749 		    spa->spa_feat_for_write_obj);
6750 		    zap_cursor_retrieve(&zc, za) == 0;
6751 		    zap_cursor_advance(&zc)) {
6752 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
6753 			    za->za_num_integers == 1);
6754 			VERIFY0(nvlist_add_uint64(features, za->za_name,
6755 			    za->za_first_integer));
6756 		}
6757 		zap_cursor_fini(&zc);
6758 	}
6759 	zap_attribute_free(za);
6760 }
6761 
6762 static void
6763 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
6764 {
6765 	int i;
6766 
6767 	for (i = 0; i < SPA_FEATURES; i++) {
6768 		zfeature_info_t feature = spa_feature_table[i];
6769 		uint64_t refcount;
6770 
6771 		if (feature_get_refcount(spa, &feature, &refcount) != 0)
6772 			continue;
6773 
6774 		VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
6775 	}
6776 }
6777 
6778 /*
6779  * Store a list of pool features and their reference counts in the
6780  * config.
6781  *
6782  * The first time this is called on a spa, allocate a new nvlist, fetch
6783  * the pool features and reference counts from disk, then save the list
6784  * in the spa. In subsequent calls on the same spa use the saved nvlist
6785  * and refresh its values from the cached reference counts.  This
6786  * ensures we don't block here on I/O on a suspended pool so 'zpool
6787  * clear' can resume the pool.
6788  */
6789 static void
6790 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
6791 {
6792 	nvlist_t *features;
6793 
6794 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
6795 
6796 	mutex_enter(&spa->spa_feat_stats_lock);
6797 	features = spa->spa_feat_stats;
6798 
6799 	if (features != NULL) {
6800 		spa_feature_stats_from_cache(spa, features);
6801 	} else {
6802 		VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
6803 		spa->spa_feat_stats = features;
6804 		spa_feature_stats_from_disk(spa, features);
6805 	}
6806 
6807 	VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
6808 	    features));
6809 
6810 	mutex_exit(&spa->spa_feat_stats_lock);
6811 }
6812 
/*
 * Retrieve the pool's configuration and statistics.
 *
 * 'name' is the pool to look up; on success '*config' receives a newly
 * generated config nvlist augmented with load times, error counts,
 * suspension state, and spare/l2cache/feature information.  'altroot'
 * (a buffer of 'buflen' bytes) receives the pool's alternate root, and
 * is filled in even for pools that fail to open.  Returns the error
 * from spa_open_common().
 */
int
spa_get_stats(const char *name, nvlist_t **config,
    char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	/* spa may be non-NULL even when the open itself failed. */
	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			uint64_t loadtimes[2];

			/* [0] = seconds, [1] = nanoseconds of the load time */
			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
			fnvlist_add_uint64_array(*config,
			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);

			fnvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_approx_errlog_size(spa));

			if (spa_suspended(spa)) {
				fnvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode);
				fnvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED_REASON,
				    spa->spa_suspended);
			}

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
			spa_add_feature_stats(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			spa_namespace_enter(FTAG);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			/* Reset so we don't unlock/close an unopened spa. */
			spa = NULL;
			spa_namespace_exit(FTAG);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	/* Drop the config lock taken above and release our open ref. */
	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
6884 
6885 /*
6886  * Validate that the auxiliary device array is well formed.  We must have an
6887  * array of nvlists, each which describes a valid leaf vdev.  If this is an
6888  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
6889  * specified, as long as they are well-formed.
6890  */
6891 static int
6892 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
6893     spa_aux_vdev_t *sav, const char *config, uint64_t version,
6894     vdev_labeltype_t label)
6895 {
6896 	nvlist_t **dev;
6897 	uint_t i, ndev;
6898 	vdev_t *vd;
6899 	int error;
6900 
6901 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
6902 
6903 	/*
6904 	 * It's acceptable to have no devs specified.
6905 	 */
6906 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
6907 		return (0);
6908 
6909 	if (ndev == 0)
6910 		return (SET_ERROR(EINVAL));
6911 
6912 	/*
6913 	 * Make sure the pool is formatted with a version that supports this
6914 	 * device type.
6915 	 */
6916 	if (spa_version(spa) < version)
6917 		return (SET_ERROR(ENOTSUP));
6918 
6919 	/*
6920 	 * Set the pending device list so we correctly handle device in-use
6921 	 * checking.
6922 	 */
6923 	sav->sav_pending = dev;
6924 	sav->sav_npending = ndev;
6925 
6926 	for (i = 0; i < ndev; i++) {
6927 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
6928 		    mode)) != 0)
6929 			goto out;
6930 
6931 		if (!vd->vdev_ops->vdev_op_leaf) {
6932 			vdev_free(vd);
6933 			error = SET_ERROR(EINVAL);
6934 			goto out;
6935 		}
6936 
6937 		vd->vdev_top = vd;
6938 
6939 		if ((error = vdev_open(vd)) == 0 &&
6940 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
6941 			fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
6942 			    vd->vdev_guid);
6943 		}
6944 
6945 		vdev_free(vd);
6946 
6947 		if (error &&
6948 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
6949 			goto out;
6950 		else
6951 			error = 0;
6952 	}
6953 
6954 out:
6955 	sav->sav_pending = NULL;
6956 	sav->sav_npending = 0;
6957 	return (error);
6958 }
6959 
6960 static int
6961 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
6962 {
6963 	int error;
6964 
6965 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
6966 
6967 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
6968 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
6969 	    VDEV_LABEL_SPARE)) != 0) {
6970 		return (error);
6971 	}
6972 
6973 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
6974 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
6975 	    VDEV_LABEL_L2CACHE));
6976 }
6977 
/*
 * Set the aux vdev list ('config' names ZPOOL_CONFIG_SPARES or
 * ZPOOL_CONFIG_L2CACHE) in sav->sav_config, appending 'devs' to any
 * devices already present.  The caller retains ownership of 'devs';
 * every entry is duplicated before being stored.
 */
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs));

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		/* Old devices first, then the newly supplied ones. */
		for (i = 0; i < oldndevs; i++)
			newdevs[i] = fnvlist_dup(olddevs[i]);
		for (i = 0; i < ndevs; i++)
			newdevs[i + oldndevs] = fnvlist_dup(devs[i]);

		/* Drop the old array (this frees olddevs[]). */
		fnvlist_remove(sav->sav_config, config);

		/* Adding the array copies it, so free our scratch list. */
		fnvlist_add_nvlist_array(sav->sav_config, config,
		    (const nvlist_t * const *)newdevs, ndevs + oldndevs);
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		sav->sav_config = fnvlist_alloc();
		fnvlist_add_nvlist_array(sav->sav_config, config,
		    (const nvlist_t * const *)devs, ndevs);
	}
}
7019 
7020 /*
7021  * Stop and drop level 2 ARC devices
7022  */
7023 void
7024 spa_l2cache_drop(spa_t *spa)
7025 {
7026 	vdev_t *vd;
7027 	int i;
7028 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
7029 
7030 	for (i = 0; i < sav->sav_count; i++) {
7031 		uint64_t pool;
7032 
7033 		vd = sav->sav_vdevs[i];
7034 		ASSERT(vd != NULL);
7035 
7036 		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
7037 		    pool != 0ULL && l2arc_vdev_present(vd))
7038 			l2arc_remove_vdev(vd);
7039 	}
7040 }
7041 
7042 /*
7043  * Verify encryption parameters for spa creation. If we are encrypting, we must
7044  * have the encryption feature flag enabled.
7045  */
7046 static int
7047 spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
7048     boolean_t has_encryption)
7049 {
7050 	if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
7051 	    dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
7052 	    !has_encryption)
7053 		return (SET_ERROR(ENOTSUP));
7054 
7055 	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
7056 }
7057 
7058 /*
7059  * Pool Creation
7060  */
7061 int
7062 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
7063     nvlist_t *zplprops, dsl_crypto_params_t *dcp)
7064 {
7065 	spa_t *spa;
7066 	const char *altroot = NULL;
7067 	vdev_t *rvd;
7068 	dsl_pool_t *dp;
7069 	dmu_tx_t *tx;
7070 	int error = 0;
7071 	uint64_t txg = TXG_INITIAL;
7072 	nvlist_t **spares, **l2cache;
7073 	uint_t nspares, nl2cache;
7074 	uint64_t version, obj, ndraid = 0;
7075 	boolean_t has_features;
7076 	boolean_t has_encryption;
7077 	boolean_t has_allocclass;
7078 	spa_feature_t feat;
7079 	const char *feat_name;
7080 	const char *poolname;
7081 	nvlist_t *nvl;
7082 
7083 	if (props == NULL ||
7084 	    nvlist_lookup_string(props,
7085 	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
7086 		poolname = (char *)pool;
7087 
7088 	/*
7089 	 * If this pool already exists, return failure.
7090 	 */
7091 	spa_namespace_enter(FTAG);
7092 	if (spa_lookup(poolname) != NULL) {
7093 		spa_namespace_exit(FTAG);
7094 		return (SET_ERROR(EEXIST));
7095 	}
7096 
7097 	/*
7098 	 * Allocate a new spa_t structure.
7099 	 */
7100 	nvl = fnvlist_alloc();
7101 	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
7102 	(void) nvlist_lookup_string(props,
7103 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
7104 	spa = spa_add(poolname, nvl, altroot);
7105 	fnvlist_free(nvl);
7106 	spa_activate(spa, spa_mode_global);
7107 
7108 	if (props && (error = spa_prop_validate(spa, props))) {
7109 		spa_deactivate(spa);
7110 		spa_remove(spa);
7111 		spa_namespace_exit(FTAG);
7112 		return (error);
7113 	}
7114 
7115 	/*
7116 	 * Temporary pool names should never be written to disk.
7117 	 */
7118 	if (poolname != pool)
7119 		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
7120 
7121 	has_features = B_FALSE;
7122 	has_encryption = B_FALSE;
7123 	has_allocclass = B_FALSE;
7124 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
7125 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
7126 		if (zpool_prop_feature(nvpair_name(elem))) {
7127 			has_features = B_TRUE;
7128 
7129 			feat_name = strchr(nvpair_name(elem), '@') + 1;
7130 			VERIFY0(zfeature_lookup_name(feat_name, &feat));
7131 			if (feat == SPA_FEATURE_ENCRYPTION)
7132 				has_encryption = B_TRUE;
7133 			if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
7134 				has_allocclass = B_TRUE;
7135 		}
7136 	}
7137 
7138 	/* verify encryption params, if they were provided */
7139 	if (dcp != NULL) {
7140 		error = spa_create_check_encryption_params(dcp, has_encryption);
7141 		if (error != 0) {
7142 			spa_deactivate(spa);
7143 			spa_remove(spa);
7144 			spa_namespace_exit(FTAG);
7145 			return (error);
7146 		}
7147 	}
7148 	if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
7149 		spa_deactivate(spa);
7150 		spa_remove(spa);
7151 		spa_namespace_exit(FTAG);
7152 		return (ENOTSUP);
7153 	}
7154 
7155 	if (has_features || nvlist_lookup_uint64(props,
7156 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
7157 		version = SPA_VERSION;
7158 	}
7159 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
7160 
7161 	spa->spa_first_txg = txg;
7162 	spa->spa_uberblock.ub_txg = txg - 1;
7163 	spa->spa_uberblock.ub_version = version;
7164 	spa->spa_ubsync = spa->spa_uberblock;
7165 	spa->spa_load_state = SPA_LOAD_CREATE;
7166 	spa->spa_removing_phys.sr_state = DSS_NONE;
7167 	spa->spa_removing_phys.sr_removing_vdev = -1;
7168 	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
7169 	spa->spa_indirect_vdevs_loaded = B_TRUE;
7170 	spa->spa_deflate = (version >= SPA_VERSION_RAIDZ_DEFLATE);
7171 
7172 	/*
7173 	 * Create "The Godfather" zio to hold all async IOs
7174 	 */
7175 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
7176 	    KM_SLEEP);
7177 	for (int i = 0; i < max_ncpus; i++) {
7178 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
7179 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
7180 		    ZIO_FLAG_GODFATHER);
7181 	}
7182 
7183 	/*
7184 	 * Create the root vdev.
7185 	 */
7186 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7187 
7188 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
7189 
7190 	ASSERT(error != 0 || rvd != NULL);
7191 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
7192 
7193 	if (error == 0 && !zfs_allocatable_devs(nvroot))
7194 		error = SET_ERROR(EINVAL);
7195 
7196 	if (error == 0 &&
7197 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
7198 	    (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
7199 	    (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
7200 		/*
7201 		 * instantiate the metaslab groups (this will dirty the vdevs)
7202 		 * we can no longer error exit past this point
7203 		 */
7204 		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
7205 			vdev_t *vd = rvd->vdev_child[c];
7206 
7207 			vdev_metaslab_set_size(vd);
7208 			vdev_expand(vd, txg);
7209 		}
7210 	}
7211 
7212 	spa_config_exit(spa, SCL_ALL, FTAG);
7213 
7214 	if (error != 0) {
7215 		spa_unload(spa);
7216 		spa_deactivate(spa);
7217 		spa_remove(spa);
7218 		spa_namespace_exit(FTAG);
7219 		return (error);
7220 	}
7221 
7222 	/*
7223 	 * Get the list of spares, if specified.
7224 	 */
7225 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
7226 	    &spares, &nspares) == 0) {
7227 		spa->spa_spares.sav_config = fnvlist_alloc();
7228 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
7229 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
7230 		    nspares);
7231 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7232 		spa_load_spares(spa);
7233 		spa_config_exit(spa, SCL_ALL, FTAG);
7234 		spa->spa_spares.sav_sync = B_TRUE;
7235 	}
7236 
7237 	/*
7238 	 * Get the list of level 2 cache devices, if specified.
7239 	 */
7240 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
7241 	    &l2cache, &nl2cache) == 0) {
7242 		VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
7243 		    NV_UNIQUE_NAME, KM_SLEEP));
7244 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
7245 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
7246 		    nl2cache);
7247 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7248 		spa_load_l2cache(spa);
7249 		spa_config_exit(spa, SCL_ALL, FTAG);
7250 		spa->spa_l2cache.sav_sync = B_TRUE;
7251 	}
7252 
7253 	spa->spa_is_initializing = B_TRUE;
7254 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
7255 	spa->spa_is_initializing = B_FALSE;
7256 
7257 	/*
7258 	 * Create DDTs (dedup tables).
7259 	 */
7260 	ddt_create(spa);
7261 	/*
7262 	 * Create BRT table and BRT table object.
7263 	 */
7264 	brt_create(spa);
7265 
7266 	spa_update_dspace(spa);
7267 
7268 	tx = dmu_tx_create_assigned(dp, txg);
7269 
7270 	/*
7271 	 * Create the pool's history object.
7272 	 */
7273 	if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
7274 		spa_history_create_obj(spa, tx);
7275 
7276 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
7277 	spa_history_log_version(spa, "create", tx);
7278 
7279 	/*
7280 	 * Create the pool config object.
7281 	 */
7282 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
7283 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
7284 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
7285 
7286 	if (zap_add(spa->spa_meta_objset,
7287 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
7288 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
7289 		cmn_err(CE_PANIC, "failed to add pool config");
7290 	}
7291 
7292 	if (zap_add(spa->spa_meta_objset,
7293 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
7294 	    sizeof (uint64_t), 1, &version, tx) != 0) {
7295 		cmn_err(CE_PANIC, "failed to add pool version");
7296 	}
7297 
7298 	/* Newly created pools with the right version are always deflated. */
7299 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
7300 		if (zap_add(spa->spa_meta_objset,
7301 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
7302 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
7303 			cmn_err(CE_PANIC, "failed to add deflate");
7304 		}
7305 	}
7306 
7307 	/*
7308 	 * Create the deferred-free bpobj.  Turn off compression
7309 	 * because sync-to-convergence takes longer if the blocksize
7310 	 * keeps changing.
7311 	 */
7312 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
7313 	dmu_object_set_compress(spa->spa_meta_objset, obj,
7314 	    ZIO_COMPRESS_OFF, tx);
7315 	if (zap_add(spa->spa_meta_objset,
7316 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
7317 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
7318 		cmn_err(CE_PANIC, "failed to add bpobj");
7319 	}
7320 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
7321 	    spa->spa_meta_objset, obj));
7322 
7323 	/*
7324 	 * Generate some random noise for salted checksums to operate on.
7325 	 */
7326 	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
7327 	    sizeof (spa->spa_cksum_salt.zcs_bytes));
7328 
7329 	/*
7330 	 * Set pool properties.
7331 	 */
7332 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
7333 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
7334 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
7335 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
7336 	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
7337 	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
7338 	spa->spa_dedup_table_quota =
7339 	    zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
7340 
7341 	if (props != NULL) {
7342 		spa_configfile_set(spa, props, B_FALSE);
7343 		spa_sync_props(props, tx);
7344 	}
7345 
7346 	for (int i = 0; i < ndraid; i++)
7347 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
7348 
7349 	dmu_tx_commit(tx);
7350 
7351 	spa->spa_sync_on = B_TRUE;
7352 	txg_sync_start(dp);
7353 	mmp_thread_start(spa);
7354 	txg_wait_synced(dp, txg);
7355 
7356 	spa_spawn_aux_threads(spa);
7357 
7358 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
7359 
7360 	/*
7361 	 * Don't count references from objsets that are already closed
7362 	 * and are making their way through the eviction process.
7363 	 */
7364 	spa_evicting_os_wait(spa);
7365 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
7366 	spa->spa_load_state = SPA_LOAD_NONE;
7367 
7368 	spa_import_os(spa);
7369 
7370 	spa_namespace_exit(FTAG);
7371 
7372 	return (0);
7373 }
7374 
7375 /*
7376  * Import a non-root pool into the system.
7377  */
7378 int
7379 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
7380 {
7381 	spa_t *spa;
7382 	const char *altroot = NULL;
7383 	spa_load_state_t state = SPA_LOAD_IMPORT;
7384 	zpool_load_policy_t policy;
7385 	spa_mode_t mode = spa_mode_global;
7386 	uint64_t readonly = B_FALSE;
7387 	int error;
7388 	nvlist_t *nvroot;
7389 	nvlist_t **spares, **l2cache;
7390 	uint_t nspares, nl2cache;
7391 
7392 	/*
7393 	 * If a pool with this name exists, return failure.
7394 	 */
7395 	spa_namespace_enter(FTAG);
7396 	if (spa_lookup(pool) != NULL) {
7397 		spa_namespace_exit(FTAG);
7398 		return (SET_ERROR(EEXIST));
7399 	}
7400 
7401 	/*
7402 	 * Create and initialize the spa structure.
7403 	 */
7404 	(void) nvlist_lookup_string(props,
7405 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
7406 	(void) nvlist_lookup_uint64(props,
7407 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
7408 	if (readonly)
7409 		mode = SPA_MODE_READ;
7410 	spa = spa_add(pool, config, altroot);
7411 	spa->spa_import_flags = flags;
7412 
7413 	/*
7414 	 * Verbatim import - Take a pool and insert it into the namespace
7415 	 * as if it had been loaded at boot.
7416 	 */
7417 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
7418 		if (props != NULL)
7419 			spa_configfile_set(spa, props, B_FALSE);
7420 
7421 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
7422 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
7423 		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
7424 		spa_namespace_exit(FTAG);
7425 		return (0);
7426 	}
7427 
7428 	spa_activate(spa, mode);
7429 
7430 	/*
7431 	 * Don't start async tasks until we know everything is healthy.
7432 	 */
7433 	spa_async_suspend(spa);
7434 
7435 	zpool_get_load_policy(config, &policy);
7436 	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
7437 		state = SPA_LOAD_RECOVER;
7438 
7439 	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
7440 
7441 	if (state != SPA_LOAD_RECOVER) {
7442 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
7443 		zfs_dbgmsg("spa_import: importing %s", pool);
7444 	} else {
7445 		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
7446 		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
7447 	}
7448 	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
7449 
7450 	/*
7451 	 * Propagate anything learned while loading the pool and pass it
7452 	 * back to caller (i.e. rewind info, missing devices, etc).
7453 	 */
7454 	fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
7455 
7456 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7457 	/*
7458 	 * Toss any existing sparelist, as it doesn't have any validity
7459 	 * anymore, and conflicts with spa_has_spare().
7460 	 */
7461 	if (spa->spa_spares.sav_config) {
7462 		nvlist_free(spa->spa_spares.sav_config);
7463 		spa->spa_spares.sav_config = NULL;
7464 		spa_load_spares(spa);
7465 	}
7466 	if (spa->spa_l2cache.sav_config) {
7467 		nvlist_free(spa->spa_l2cache.sav_config);
7468 		spa->spa_l2cache.sav_config = NULL;
7469 		spa_load_l2cache(spa);
7470 	}
7471 
7472 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
7473 	spa_config_exit(spa, SCL_ALL, FTAG);
7474 
7475 	if (props != NULL)
7476 		spa_configfile_set(spa, props, B_FALSE);
7477 
7478 	if (error != 0 || (props && spa_writeable(spa) &&
7479 	    (error = spa_prop_set(spa, props)))) {
7480 		spa_unload(spa);
7481 		spa_deactivate(spa);
7482 		spa_remove(spa);
7483 		spa_namespace_exit(FTAG);
7484 		return (error);
7485 	}
7486 
7487 	spa_async_resume(spa);
7488 
7489 	/*
7490 	 * Override any spares and level 2 cache devices as specified by
7491 	 * the user, as these may have correct device names/devids, etc.
7492 	 */
7493 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
7494 	    &spares, &nspares) == 0) {
7495 		if (spa->spa_spares.sav_config)
7496 			fnvlist_remove(spa->spa_spares.sav_config,
7497 			    ZPOOL_CONFIG_SPARES);
7498 		else
7499 			spa->spa_spares.sav_config = fnvlist_alloc();
7500 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
7501 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
7502 		    nspares);
7503 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7504 		spa_load_spares(spa);
7505 		spa_config_exit(spa, SCL_ALL, FTAG);
7506 		spa->spa_spares.sav_sync = B_TRUE;
7507 		spa->spa_spares.sav_label_sync = B_TRUE;
7508 	}
7509 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
7510 	    &l2cache, &nl2cache) == 0) {
7511 		if (spa->spa_l2cache.sav_config)
7512 			fnvlist_remove(spa->spa_l2cache.sav_config,
7513 			    ZPOOL_CONFIG_L2CACHE);
7514 		else
7515 			spa->spa_l2cache.sav_config = fnvlist_alloc();
7516 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
7517 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
7518 		    nl2cache);
7519 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7520 		spa_load_l2cache(spa);
7521 		spa_config_exit(spa, SCL_ALL, FTAG);
7522 		spa->spa_l2cache.sav_sync = B_TRUE;
7523 		spa->spa_l2cache.sav_label_sync = B_TRUE;
7524 	}
7525 
7526 	/*
7527 	 * Check for any removed devices.
7528 	 */
7529 	if (spa->spa_autoreplace) {
7530 		spa_aux_check_removed(&spa->spa_spares);
7531 		spa_aux_check_removed(&spa->spa_l2cache);
7532 	}
7533 
7534 	if (spa_writeable(spa)) {
7535 		/*
7536 		 * Update the config cache to include the newly-imported pool.
7537 		 */
7538 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
7539 	}
7540 
7541 	/*
7542 	 * It's possible that the pool was expanded while it was exported.
7543 	 * We kick off an async task to handle this for us.
7544 	 */
7545 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
7546 
7547 	spa_history_log_version(spa, "import", NULL);
7548 
7549 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
7550 
7551 	spa_namespace_exit(FTAG);
7552 
7553 	zvol_create_minors(pool);
7554 
7555 	spa_import_os(spa);
7556 
7557 	return (0);
7558 }
7559 
/*
 * Probe-import a pool without leaving it imported.
 *
 * Loads the pool described by 'tryconfig' read-only under a unique scratch
 * name, generates and returns its current configuration, then unloads and
 * removes the spa_t again.  Returns the config nvlist (caller must free),
 * or NULL if 'tryconfig' lacks a pool name or state, or if no vdev tree
 * could be constructed at all.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	const char *poolname, *cachefile;
	spa_t *spa;
	uint64_t state;
	int error;
	zpool_load_policy_t policy;

	/* Both the pool name and state must be present in 'tryconfig'. */
	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.  The scratch name
	 * embeds curthread so concurrent tryimports don't collide in
	 * the namespace.
	 */
	char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
	    TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);

	spa_namespace_enter(FTAG);
	spa = spa_add(name, tryconfig, NULL);
	/* Read-only: a tryimport must never modify on-disk state. */
	spa_activate(spa, SPA_MODE_READ);
	kmem_free(name, MAXPATHLEN);

	/* Remember the real pool name for logging and config generation. */
	spa->spa_load_name = spa_strdup(poolname);

	/*
	 * Rewind pool if a max txg was provided.
	 */
	zpool_get_load_policy(spa->spa_config, &policy);
	if (policy.zlp_txg != UINT64_MAX) {
		spa->spa_load_max_txg = policy.zlp_txg;
		spa->spa_extreme_rewind = B_TRUE;
		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
		    spa_load_name(spa), (longlong_t)policy.zlp_txg);
	} else {
		zfs_dbgmsg("spa_tryimport: importing %s", spa_load_name(spa));
	}

	/* Record whether the config came from a cachefile or a device scan. */
	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
	    == 0) {
		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
	} else {
		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
	}

	/*
	 * spa_import() relies on the pool config fetched here for its
	 * spare/cache devices, but import flags are not passed down to
	 * spa_tryimport().  Without them, a missing log device would make
	 * the load return early and we would never retrieve the spare and
	 * cache device configuration.  Setting ZFS_IMPORT_MISSING_LOG makes
	 * the load fetch the full configuration regardless of a missing
	 * log device.
	 */
	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;

	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    spa_load_name(spa));
		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp);
		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info);
		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
		    spa->spa_errata);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname;

				dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/*
				 * Substitute the real pool name for the
				 * scratch name in the dataset path.
				 */
				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", spa_load_name(spa), ++cp);
				}
				fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
				    dsname);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/* Always tear the probe pool back down before returning. */
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	spa_namespace_exit(FTAG);

	return (config);
}
7688 
7689 /*
7690  * Pool export/destroy
7691  *
7692  * The act of destroying or exporting a pool is very simple.  We make sure there
7693  * is no more pending I/O and any references to the pool are gone.  Then, we
7694  * update the pool state and sync all the labels to disk, removing the
7695  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
7696  * we don't sync the labels or remove the configuration cache.
7697  */
7698 static int
7699 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
7700     boolean_t force, boolean_t hardforce)
7701 {
7702 	int error = 0;
7703 	spa_t *spa;
7704 	hrtime_t export_start = gethrtime();
7705 
7706 	if (oldconfig)
7707 		*oldconfig = NULL;
7708 
7709 	if (!(spa_mode_global & SPA_MODE_WRITE))
7710 		return (SET_ERROR(EROFS));
7711 
7712 	spa_namespace_enter(FTAG);
7713 	if ((spa = spa_lookup(pool)) == NULL) {
7714 		spa_namespace_exit(FTAG);
7715 		return (SET_ERROR(ENOENT));
7716 	}
7717 
7718 	if (spa->spa_is_exporting) {
7719 		/* the pool is being exported by another thread */
7720 		spa_namespace_exit(FTAG);
7721 		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
7722 	}
7723 	spa->spa_is_exporting = B_TRUE;
7724 
7725 	/*
7726 	 * Put a hold on the pool, drop the namespace lock, stop async tasks
7727 	 * and see if we can export.
7728 	 */
7729 	spa_open_ref(spa, FTAG);
7730 	spa_namespace_exit(FTAG);
7731 	spa_async_suspend(spa);
7732 	if (spa->spa_zvol_taskq) {
7733 		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
7734 		taskq_wait(spa->spa_zvol_taskq);
7735 	}
7736 	spa_namespace_enter(FTAG);
7737 	spa->spa_export_thread = curthread;
7738 	spa_close(spa, FTAG);
7739 
7740 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
7741 		spa_namespace_exit(FTAG);
7742 		goto export_spa;
7743 	}
7744 
7745 	/*
7746 	 * The pool will be in core if it's openable, in which case we can
7747 	 * modify its state.  Objsets may be open only because they're dirty,
7748 	 * so we have to force it to sync before checking spa_refcnt.
7749 	 */
7750 	if (spa->spa_sync_on) {
7751 		txg_wait_synced(spa->spa_dsl_pool, 0);
7752 		spa_evicting_os_wait(spa);
7753 	}
7754 
7755 	/*
7756 	 * A pool cannot be exported or destroyed if there are active
7757 	 * references.  If we are resetting a pool, allow references by
7758 	 * fault injection handlers.
7759 	 */
7760 	if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
7761 		error = SET_ERROR(EBUSY);
7762 		goto fail;
7763 	}
7764 
7765 	spa_namespace_exit(FTAG);
7766 	/*
7767 	 * At this point we no longer hold the spa_namespace_lock and
7768 	 * there were no references on the spa. Future spa_lookups will
7769 	 * notice the spa->spa_export_thread and wait until we signal
7770 	 * that we are finshed.
7771 	 */
7772 
7773 	if (spa->spa_sync_on) {
7774 		vdev_t *rvd = spa->spa_root_vdev;
7775 		/*
7776 		 * A pool cannot be exported if it has an active shared spare.
7777 		 * This is to prevent other pools stealing the active spare
7778 		 * from an exported pool. At user's own will, such pool can
7779 		 * be forcedly exported.
7780 		 */
7781 		if (!force && new_state == POOL_STATE_EXPORTED &&
7782 		    spa_has_active_shared_spare(spa)) {
7783 			error = SET_ERROR(EXDEV);
7784 			spa_namespace_enter(FTAG);
7785 			goto fail;
7786 		}
7787 
7788 		/*
7789 		 * We're about to export or destroy this pool. Make sure
7790 		 * we stop all initialization and trim activity here before
7791 		 * we set the spa_final_txg. This will ensure that all
7792 		 * dirty data resulting from the initialization is
7793 		 * committed to disk before we unload the pool.
7794 		 */
7795 		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
7796 		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
7797 		vdev_autotrim_stop_all(spa);
7798 		vdev_rebuild_stop_all(spa);
7799 		l2arc_spa_rebuild_stop(spa);
7800 
7801 		/*
7802 		 * We want this to be reflected on every label,
7803 		 * so mark them all dirty.  spa_unload() will do the
7804 		 * final sync that pushes these changes out.
7805 		 */
7806 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
7807 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7808 			spa->spa_state = new_state;
7809 			vdev_config_dirty(rvd);
7810 			spa_config_exit(spa, SCL_ALL, FTAG);
7811 		}
7812 
7813 		if (spa_should_sync_time_logger_on_unload(spa))
7814 			spa_unload_sync_time_logger(spa);
7815 
7816 		/*
7817 		 * If the log space map feature is enabled and the pool is
7818 		 * getting exported (but not destroyed), we want to spend some
7819 		 * time flushing as many metaslabs as we can in an attempt to
7820 		 * destroy log space maps and save import time. This has to be
7821 		 * done before we set the spa_final_txg, otherwise
7822 		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
7823 		 * spa_should_flush_logs_on_unload() should be called after
7824 		 * spa_state has been set to the new_state.
7825 		 */
7826 		if (spa_should_flush_logs_on_unload(spa))
7827 			spa_unload_log_sm_flush_all(spa);
7828 
7829 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
7830 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7831 			spa->spa_final_txg = spa_last_synced_txg(spa) +
7832 			    TXG_DEFER_SIZE + 1;
7833 			spa_config_exit(spa, SCL_ALL, FTAG);
7834 		}
7835 	}
7836 
7837 export_spa:
7838 	spa_export_os(spa);
7839 
7840 	if (new_state == POOL_STATE_DESTROYED)
7841 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
7842 	else if (new_state == POOL_STATE_EXPORTED)
7843 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
7844 
7845 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
7846 		spa_unload(spa);
7847 		spa_deactivate(spa);
7848 	}
7849 
7850 	if (oldconfig && spa->spa_config)
7851 		*oldconfig = fnvlist_dup(spa->spa_config);
7852 
7853 	if (new_state == POOL_STATE_EXPORTED)
7854 		zio_handle_export_delay(spa, gethrtime() - export_start);
7855 
7856 	/*
7857 	 * Take the namespace lock for the actual spa_t removal
7858 	 */
7859 	spa_namespace_enter(FTAG);
7860 	if (new_state != POOL_STATE_UNINITIALIZED) {
7861 		if (!hardforce)
7862 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
7863 		spa_remove(spa);
7864 	} else {
7865 		/*
7866 		 * If spa_remove() is not called for this spa_t and
7867 		 * there is any possibility that it can be reused,
7868 		 * we make sure to reset the exporting flag.
7869 		 */
7870 		spa->spa_is_exporting = B_FALSE;
7871 		spa->spa_export_thread = NULL;
7872 	}
7873 
7874 	/*
7875 	 * Wake up any waiters in spa_lookup()
7876 	 */
7877 	spa_namespace_broadcast();
7878 	spa_namespace_exit(FTAG);
7879 	return (0);
7880 
7881 fail:
7882 	spa->spa_is_exporting = B_FALSE;
7883 	spa->spa_export_thread = NULL;
7884 
7885 	spa_async_resume(spa);
7886 	/*
7887 	 * Wake up any waiters in spa_lookup()
7888 	 */
7889 	spa_namespace_broadcast();
7890 	spa_namespace_exit(FTAG);
7891 	return (error);
7892 }
7893 
7894 /*
7895  * Destroy a storage pool.
7896  */
7897 int
7898 spa_destroy(const char *pool)
7899 {
7900 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
7901 	    B_FALSE, B_FALSE));
7902 }
7903 
7904 /*
7905  * Export a storage pool.
7906  */
7907 int
7908 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
7909     boolean_t hardforce)
7910 {
7911 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
7912 	    force, hardforce));
7913 }
7914 
7915 /*
7916  * Similar to spa_export(), this unloads the spa_t without actually removing it
7917  * from the namespace in any way.
7918  */
7919 int
7920 spa_reset(const char *pool)
7921 {
7922 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
7923 	    B_FALSE, B_FALSE));
7924 }
7925 
7926 /*
7927  * ==========================================================================
7928  * Device manipulation
7929  * ==========================================================================
7930  */
7931 
7932 /*
7933  * This is called as a synctask to increment the draid feature flag
7934  */
7935 static void
7936 spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
7937 {
7938 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7939 	int draid = (int)(uintptr_t)arg;
7940 
7941 	for (int c = 0; c < draid; c++)
7942 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
7943 }
7944 
7945 /*
7946  * Add a device to a storage pool.
7947  */
7948 int
7949 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
7950 {
7951 	uint64_t txg, ndraid = 0;
7952 	int error;
7953 	vdev_t *rvd = spa->spa_root_vdev;
7954 	vdev_t *vd, *tvd;
7955 	nvlist_t **spares, **l2cache;
7956 	uint_t nspares, nl2cache;
7957 
7958 	ASSERT(spa_writeable(spa));
7959 
7960 	txg = spa_vdev_enter(spa);
7961 
7962 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
7963 	    VDEV_ALLOC_ADD)) != 0)
7964 		return (spa_vdev_exit(spa, NULL, txg, error));
7965 
7966 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
7967 
7968 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
7969 	    &nspares) != 0)
7970 		nspares = 0;
7971 
7972 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
7973 	    &nl2cache) != 0)
7974 		nl2cache = 0;
7975 
7976 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
7977 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
7978 
7979 	if (vd->vdev_children != 0 &&
7980 	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
7981 		return (spa_vdev_exit(spa, vd, txg, error));
7982 	}
7983 
7984 	/*
7985 	 * The virtual dRAID spares must be added after vdev tree is created
7986 	 * and the vdev guids are generated.  The guid of their associated
7987 	 * dRAID is stored in the config and used when opening the spare.
7988 	 */
7989 	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
7990 	    rvd->vdev_children)) == 0) {
7991 		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
7992 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
7993 			nspares = 0;
7994 	} else {
7995 		return (spa_vdev_exit(spa, vd, txg, error));
7996 	}
7997 
7998 	/*
7999 	 * We must validate the spares and l2cache devices after checking the
8000 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
8001 	 */
8002 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
8003 		return (spa_vdev_exit(spa, vd, txg, error));
8004 
8005 	/*
8006 	 * If we are in the middle of a device removal, we can only add
8007 	 * devices which match the existing devices in the pool.
8008 	 * If we are in the middle of a removal, or have some indirect
8009 	 * vdevs, we can not add raidz or dRAID top levels.
8010 	 */
8011 	if (spa->spa_vdev_removal != NULL ||
8012 	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
8013 		for (int c = 0; c < vd->vdev_children; c++) {
8014 			tvd = vd->vdev_child[c];
8015 			if (spa->spa_vdev_removal != NULL &&
8016 			    tvd->vdev_ashift != spa->spa_max_ashift) {
8017 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
8018 			}
8019 			/* Fail if top level vdev is raidz or a dRAID */
8020 			if (vdev_get_nparity(tvd) != 0)
8021 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
8022 
8023 			/*
8024 			 * Need the top level mirror to be
8025 			 * a mirror of leaf vdevs only
8026 			 */
8027 			if (tvd->vdev_ops == &vdev_mirror_ops) {
8028 				for (uint64_t cid = 0;
8029 				    cid < tvd->vdev_children; cid++) {
8030 					vdev_t *cvd = tvd->vdev_child[cid];
8031 					if (!cvd->vdev_ops->vdev_op_leaf) {
8032 						return (spa_vdev_exit(spa, vd,
8033 						    txg, EINVAL));
8034 					}
8035 				}
8036 			}
8037 		}
8038 	}
8039 
8040 	if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
8041 		for (int c = 0; c < vd->vdev_children; c++) {
8042 			tvd = vd->vdev_child[c];
8043 			if (tvd->vdev_ashift != spa->spa_max_ashift) {
8044 				return (spa_vdev_exit(spa, vd, txg,
8045 				    ZFS_ERR_ASHIFT_MISMATCH));
8046 			}
8047 		}
8048 	}
8049 
8050 	for (int c = 0; c < vd->vdev_children; c++) {
8051 		tvd = vd->vdev_child[c];
8052 		vdev_remove_child(vd, tvd);
8053 		tvd->vdev_id = rvd->vdev_children;
8054 		vdev_add_child(rvd, tvd);
8055 		vdev_config_dirty(tvd);
8056 	}
8057 
8058 	if (nspares != 0) {
8059 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
8060 		    ZPOOL_CONFIG_SPARES);
8061 		spa_load_spares(spa);
8062 		spa->spa_spares.sav_sync = B_TRUE;
8063 	}
8064 
8065 	if (nl2cache != 0) {
8066 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
8067 		    ZPOOL_CONFIG_L2CACHE);
8068 		spa_load_l2cache(spa);
8069 		spa->spa_l2cache.sav_sync = B_TRUE;
8070 	}
8071 
8072 	/*
8073 	 * We can't increment a feature while holding spa_vdev so we
8074 	 * have to do it in a synctask.
8075 	 */
8076 	if (ndraid != 0) {
8077 		dmu_tx_t *tx;
8078 
8079 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
8080 		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
8081 		    (void *)(uintptr_t)ndraid, tx);
8082 		dmu_tx_commit(tx);
8083 	}
8084 
8085 	/*
8086 	 * We have to be careful when adding new vdevs to an existing pool.
8087 	 * If other threads start allocating from these vdevs before we
8088 	 * sync the config cache, and we lose power, then upon reboot we may
8089 	 * fail to open the pool because there are DVAs that the config cache
8090 	 * can't translate.  Therefore, we first add the vdevs without
8091 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
8092 	 * and then let spa_config_update() initialize the new metaslabs.
8093 	 *
8094 	 * spa_load() checks for added-but-not-initialized vdevs, so that
8095 	 * if we lose power at any point in this sequence, the remaining
8096 	 * steps will be completed the next time we load the pool.
8097 	 */
8098 	(void) spa_vdev_exit(spa, vd, txg, 0);
8099 
8100 	spa_namespace_enter(FTAG);
8101 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
8102 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
8103 	spa_namespace_exit(FTAG);
8104 
8105 	return (0);
8106 }
8107 
8108 /*
8109  * Given a vdev to be replaced and its parent, check for a possible
8110  * "double spare" condition if a vdev is to be replaced by a spare.  When this
8111  * happens, you can get two spares assigned to one failed vdev.
8112  *
8113  * To trigger a double spare condition:
8114  *
8115  * 1. disk1 fails
8116  * 2. 1st spare is kicked in for disk1 and it resilvers
8117  * 3. Someone replaces disk1 with a new blank disk
8118  * 4. New blank disk starts resilvering
8119  * 5. While resilvering, new blank disk has IO errors and faults
8120  * 6. 2nd spare is kicked in for new blank disk
8121  * 7. At this point two spares are kicked in for the original disk1.
8122  *
8123  * It looks like this:
8124  *
8125  * NAME                                            STATE     READ WRITE CKSUM
8126  * tank2                                           DEGRADED     0     0     0
8127  *   draid2:6d:10c:2s-0                            DEGRADED     0     0     0
8128  *     scsi-0QEMU_QEMU_HARDDISK_d1                 ONLINE       0     0     0
8129  *     scsi-0QEMU_QEMU_HARDDISK_d2                 ONLINE       0     0     0
8130  *     scsi-0QEMU_QEMU_HARDDISK_d3                 ONLINE       0     0     0
8131  *     scsi-0QEMU_QEMU_HARDDISK_d4                 ONLINE       0     0     0
8132  *     scsi-0QEMU_QEMU_HARDDISK_d5                 ONLINE       0     0     0
8133  *     scsi-0QEMU_QEMU_HARDDISK_d6                 ONLINE       0     0     0
8134  *     scsi-0QEMU_QEMU_HARDDISK_d7                 ONLINE       0     0     0
8135  *     scsi-0QEMU_QEMU_HARDDISK_d8                 ONLINE       0     0     0
8136  *     scsi-0QEMU_QEMU_HARDDISK_d9                 ONLINE       0     0     0
8137  *     spare-9                                     DEGRADED     0     0     0
8138  *       replacing-0                               DEGRADED     0    93     0
8139  *         scsi-0QEMU_QEMU_HARDDISK_d10-part1/old  UNAVAIL      0     0     0
8140  *         spare-1                                 DEGRADED     0     0     0
8141  *           scsi-0QEMU_QEMU_HARDDISK_d10          REMOVED      0     0     0
8142  *           draid2-0-0                            ONLINE       0     0     0
8143  *       draid2-0-1                                ONLINE       0     0     0
8144  * spares
8145  *   draid2-0-0                                    INUSE     currently in use
8146  *   draid2-0-1                                    INUSE     currently in use
8147  *
8148  * ARGS:
8149  *
8150  * newvd:  New spare disk
8151  * pvd:    Parent vdev_t the spare should attach to
8152  *
8153  * This function returns B_TRUE if adding the new vdev would create a double
8154  * spare condition, B_FALSE otherwise.
8155  */
8156 static boolean_t
8157 spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd)
8158 {
8159 	vdev_t *ppvd;
8160 
8161 	ppvd = pvd->vdev_parent;
8162 	if (ppvd == NULL)
8163 		return (B_FALSE);
8164 
8165 	/*
8166 	 * To determine if this configuration would cause a double spare, we
8167 	 * look at the vdev_op of the parent vdev, and of the parent's parent
8168 	 * vdev.  We also look at vdev_isspare on the new disk.  A double spare
8169 	 * condition looks like this:
8170 	 *
8171 	 * 1. parent of parent's op is a spare or draid spare
8172 	 * 2. parent's op is replacing
8173 	 * 3. new disk is a spare
8174 	 */
8175 	if ((ppvd->vdev_ops == &vdev_spare_ops) ||
8176 	    (ppvd->vdev_ops == &vdev_draid_spare_ops))
8177 		if (pvd->vdev_ops == &vdev_replacing_ops)
8178 			if (newvd->vdev_isspare)
8179 				return (B_TRUE);
8180 
8181 	return (B_FALSE);
8182 }
8183 
8184 /*
8185  * Attach a device to a vdev specified by its guid.  The vdev type can be
8186  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
8187  * single device). When the vdev is a single device, a mirror vdev will be
8188  * automatically inserted.
8189  *
8190  * If 'replacing' is specified, the new device is intended to replace the
8191  * existing device; in this case the two devices are made into their own
8192  * mirror using the 'replacing' vdev, which is functionally identical to
8193  * the mirror vdev (it actually reuses all the same ops) but has a few
8194  * extra rules: you can't attach to it after it's been created, and upon
8195  * completion of resilvering, the first disk (the one being replaced)
8196  * is automatically detached.
8197  *
 * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
 * should be performed instead of traditional healing reconstruction.  From
 * an administrator's perspective these are both resilver operations.
8201  */
8202 int
8203 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
8204     int rebuild)
8205 {
8206 	uint64_t txg, dtl_max_txg;
8207 	vdev_t *rvd = spa->spa_root_vdev;
8208 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
8209 	vdev_ops_t *pvops;
8210 	char *oldvdpath, *newvdpath;
8211 	int newvd_isspare = B_FALSE;
8212 	int error;
8213 
8214 	ASSERT(spa_writeable(spa));
8215 
8216 	txg = spa_vdev_enter(spa);
8217 
8218 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
8219 
8220 	ASSERT(spa_namespace_held());
8221 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
8222 		error = (spa_has_checkpoint(spa)) ?
8223 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
8224 		return (spa_vdev_exit(spa, NULL, txg, error));
8225 	}
8226 
8227 	if (rebuild) {
8228 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
8229 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
8230 
8231 		if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
8232 		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
8233 			return (spa_vdev_exit(spa, NULL, txg,
8234 			    ZFS_ERR_RESILVER_IN_PROGRESS));
8235 		}
8236 	} else {
8237 		if (vdev_rebuild_active(rvd))
8238 			return (spa_vdev_exit(spa, NULL, txg,
8239 			    ZFS_ERR_REBUILD_IN_PROGRESS));
8240 	}
8241 
8242 	if (spa->spa_vdev_removal != NULL) {
8243 		return (spa_vdev_exit(spa, NULL, txg,
8244 		    ZFS_ERR_DEVRM_IN_PROGRESS));
8245 	}
8246 
8247 	if (oldvd == NULL)
8248 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
8249 
8250 	boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
8251 
8252 	if (raidz) {
8253 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
8254 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
8255 
8256 		/*
8257 		 * Can't expand a raidz while prior expand is in progress.
8258 		 */
8259 		if (spa->spa_raidz_expand != NULL) {
8260 			return (spa_vdev_exit(spa, NULL, txg,
8261 			    ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
8262 		}
8263 	} else if (!oldvd->vdev_ops->vdev_op_leaf) {
8264 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
8265 	}
8266 
8267 	if (raidz)
8268 		pvd = oldvd;
8269 	else
8270 		pvd = oldvd->vdev_parent;
8271 
8272 	if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
8273 	    VDEV_ALLOC_ATTACH) != 0)
8274 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
8275 
8276 	if (newrootvd->vdev_children != 1)
8277 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
8278 
8279 	newvd = newrootvd->vdev_child[0];
8280 
8281 	if (!newvd->vdev_ops->vdev_op_leaf)
8282 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
8283 
8284 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
8285 		return (spa_vdev_exit(spa, newrootvd, txg, error));
8286 
8287 	/*
8288 	 * log, dedup and special vdevs should not be replaced by spares.
8289 	 */
8290 	if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
8291 	    oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
8292 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8293 	}
8294 
8295 	/*
8296 	 * A dRAID spare can only replace a child of its parent dRAID vdev.
8297 	 */
8298 	if (newvd->vdev_ops == &vdev_draid_spare_ops &&
8299 	    oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
8300 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8301 	}
8302 
8303 	if (rebuild) {
8304 		/*
8305 		 * For rebuilds, the top vdev must support reconstruction
8306 		 * using only space maps.  This means the only allowable
8307 		 * vdevs types are the root vdev, a mirror, or dRAID.
8308 		 */
8309 		tvd = pvd;
8310 		if (pvd->vdev_top != NULL)
8311 			tvd = pvd->vdev_top;
8312 
8313 		if (tvd->vdev_ops != &vdev_mirror_ops &&
8314 		    tvd->vdev_ops != &vdev_root_ops &&
8315 		    tvd->vdev_ops != &vdev_draid_ops) {
8316 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8317 		}
8318 	}
8319 
8320 	if (!replacing) {
8321 		/*
8322 		 * For attach, the only allowable parent is a mirror or
8323 		 * the root vdev. A raidz vdev can be attached to, but
8324 		 * you cannot attach to a raidz child.
8325 		 */
8326 		if (pvd->vdev_ops != &vdev_mirror_ops &&
8327 		    pvd->vdev_ops != &vdev_root_ops &&
8328 		    !raidz)
8329 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8330 
8331 		pvops = &vdev_mirror_ops;
8332 	} else {
8333 		/*
8334 		 * Active hot spares can only be replaced by inactive hot
8335 		 * spares.
8336 		 */
8337 		if (pvd->vdev_ops == &vdev_spare_ops &&
8338 		    oldvd->vdev_isspare &&
8339 		    !spa_has_spare(spa, newvd->vdev_guid))
8340 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8341 
8342 		/*
8343 		 * If the source is a hot spare, and the parent isn't already a
8344 		 * spare, then we want to create a new hot spare.  Otherwise, we
8345 		 * want to create a replacing vdev.  The user is not allowed to
8346 		 * attach to a spared vdev child unless the 'isspare' state is
8347 		 * the same (spare replaces spare, non-spare replaces
8348 		 * non-spare).
8349 		 */
8350 		if (pvd->vdev_ops == &vdev_replacing_ops &&
8351 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
8352 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8353 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
8354 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
8355 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8356 		}
8357 
8358 		if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) {
8359 			vdev_dbgmsg(newvd,
8360 			    "disk would create double spares, ignore.");
8361 			return (spa_vdev_exit(spa, newrootvd, txg, EEXIST));
8362 		}
8363 
8364 		if (newvd->vdev_isspare)
8365 			pvops = &vdev_spare_ops;
8366 		else
8367 			pvops = &vdev_replacing_ops;
8368 	}
8369 
8370 	/*
8371 	 * Make sure the new device is big enough.
8372 	 */
8373 	vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
8374 	if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
8375 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
8376 
8377 	/*
8378 	 * The new device cannot have a higher alignment requirement
8379 	 * than the top-level vdev.
8380 	 */
8381 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
8382 		return (spa_vdev_exit(spa, newrootvd, txg,
8383 		    ZFS_ERR_ASHIFT_MISMATCH));
8384 	}
8385 
8386 	/*
8387 	 * RAIDZ-expansion-specific checks.
8388 	 */
8389 	if (raidz) {
8390 		if (vdev_raidz_attach_check(newvd) != 0)
8391 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
8392 
8393 		/*
8394 		 * Fail early if a child is not healthy or being replaced
8395 		 */
8396 		for (int i = 0; i < oldvd->vdev_children; i++) {
8397 			if (vdev_is_dead(oldvd->vdev_child[i]) ||
8398 			    !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
8399 				return (spa_vdev_exit(spa, newrootvd, txg,
8400 				    ENXIO));
8401 			}
8402 			/* Also fail if reserved boot area is in-use */
8403 			if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
8404 			    != 0) {
8405 				return (spa_vdev_exit(spa, newrootvd, txg,
8406 				    EADDRINUSE));
8407 			}
8408 		}
8409 	}
8410 
8411 	if (raidz) {
8412 		/*
8413 		 * Note: oldvdpath is freed by spa_strfree(),  but
8414 		 * kmem_asprintf() is freed by kmem_strfree(), so we have to
8415 		 * move it to a spa_strdup-ed string.
8416 		 */
8417 		char *tmp = kmem_asprintf("raidz%u-%u",
8418 		    (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
8419 		oldvdpath = spa_strdup(tmp);
8420 		kmem_strfree(tmp);
8421 	} else {
8422 		oldvdpath = spa_strdup(oldvd->vdev_path);
8423 	}
8424 	newvdpath = spa_strdup(newvd->vdev_path);
8425 
8426 	/*
8427 	 * If this is an in-place replacement, update oldvd's path and devid
8428 	 * to make it distinguishable from newvd, and unopenable from now on.
8429 	 */
8430 	if (strcmp(oldvdpath, newvdpath) == 0) {
8431 		spa_strfree(oldvd->vdev_path);
8432 		oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
8433 		    KM_SLEEP);
8434 		(void) sprintf(oldvd->vdev_path, "%s/old",
8435 		    newvdpath);
8436 		if (oldvd->vdev_devid != NULL) {
8437 			spa_strfree(oldvd->vdev_devid);
8438 			oldvd->vdev_devid = NULL;
8439 		}
8440 		spa_strfree(oldvdpath);
8441 		oldvdpath = spa_strdup(oldvd->vdev_path);
8442 	}
8443 
8444 	/*
8445 	 * If the parent is not a mirror, or if we're replacing, insert the new
8446 	 * mirror/replacing/spare vdev above oldvd.
8447 	 */
8448 	if (!raidz && pvd->vdev_ops != pvops) {
8449 		pvd = vdev_add_parent(oldvd, pvops);
8450 		ASSERT(pvd->vdev_ops == pvops);
8451 		ASSERT(oldvd->vdev_parent == pvd);
8452 	}
8453 
8454 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
8455 
8456 	/*
8457 	 * Extract the new device from its root and add it to pvd.
8458 	 */
8459 	vdev_remove_child(newrootvd, newvd);
8460 	newvd->vdev_id = pvd->vdev_children;
8461 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
8462 	vdev_add_child(pvd, newvd);
8463 
8464 	/*
8465 	 * Reevaluate the parent vdev state.
8466 	 */
8467 	vdev_propagate_state(pvd);
8468 
8469 	tvd = newvd->vdev_top;
8470 	ASSERT(pvd->vdev_top == tvd);
8471 	ASSERT(tvd->vdev_parent == rvd);
8472 
8473 	vdev_config_dirty(tvd);
8474 
8475 	/*
8476 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
8477 	 * for any dmu_sync-ed blocks.  It will propagate upward when
8478 	 * spa_vdev_exit() calls vdev_dtl_reassess().
8479 	 */
8480 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
8481 
8482 	if (raidz) {
8483 		/*
8484 		 * Wait for the youngest allocations and frees to sync,
8485 		 * and then wait for the deferral of those frees to finish.
8486 		 */
8487 		spa_vdev_config_exit(spa, NULL,
8488 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
8489 
8490 		vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
8491 		vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
8492 		vdev_autotrim_stop_wait(tvd);
8493 
8494 		dtl_max_txg = spa_vdev_config_enter(spa);
8495 
8496 		tvd->vdev_rz_expanding = B_TRUE;
8497 
8498 		vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
8499 		vdev_config_dirty(tvd);
8500 
8501 		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
8502 		    dtl_max_txg);
8503 		dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
8504 		    newvd, tx);
8505 		dmu_tx_commit(tx);
8506 	} else {
8507 		vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
8508 		    dtl_max_txg - TXG_INITIAL);
8509 
8510 		if (newvd->vdev_isspare) {
8511 			spa_spare_activate(newvd);
8512 			spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
8513 		}
8514 
8515 		newvd_isspare = newvd->vdev_isspare;
8516 
8517 		/*
8518 		 * Mark newvd's DTL dirty in this txg.
8519 		 */
8520 		vdev_dirty(tvd, VDD_DTL, newvd, txg);
8521 
8522 		/*
8523 		 * Schedule the resilver or rebuild to restart in the future.
8524 		 * We do this to ensure that dmu_sync-ed blocks have been
8525 		 * stitched into the respective datasets.
8526 		 */
8527 		if (rebuild) {
8528 			newvd->vdev_rebuild_txg = txg;
8529 
8530 			vdev_rebuild(tvd, txg);
8531 		} else {
8532 			newvd->vdev_resilver_txg = txg;
8533 
8534 			if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
8535 			    spa_feature_is_enabled(spa,
8536 			    SPA_FEATURE_RESILVER_DEFER)) {
8537 				vdev_defer_resilver(newvd);
8538 			} else {
8539 				dsl_scan_restart_resilver(spa->spa_dsl_pool,
8540 				    dtl_max_txg);
8541 			}
8542 		}
8543 	}
8544 
8545 	if (spa->spa_bootfs)
8546 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
8547 
8548 	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
8549 
8550 	/*
8551 	 * Commit the config
8552 	 */
8553 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
8554 
8555 	spa_history_log_internal(spa, "vdev attach", NULL,
8556 	    "%s vdev=%s %s vdev=%s",
8557 	    replacing && newvd_isspare ? "spare in" :
8558 	    replacing ? "replace" : "attach", newvdpath,
8559 	    replacing ? "for" : "to", oldvdpath);
8560 
8561 	spa_strfree(oldvdpath);
8562 	spa_strfree(newvdpath);
8563 
8564 	return (0);
8565 }
8566 
8567 /*
8568  * Detach a device from a mirror or replacing vdev.
8569  *
8570  * If 'replace_done' is specified, only detach if the parent
8571  * is a replacing or a spare vdev.
8572  */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	/*
	 * Grab the namespace lock and the vdev config locks for the txg
	 * in which this detach will be committed.
	 */
	txg = spa_vdev_detach_enter(spa, guid);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	/*
	 * Besides being called directly from the userland through the
	 * ioctl interface, spa_vdev_detach() can be potentially called
	 * at the end of spa_vdev_resilver_done().
	 *
	 * In the regular case, when we have a checkpoint this shouldn't
	 * happen as we never empty the DTLs of a vdev during the scrub
	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done()
	 * should never get here when we have a checkpoint.
	 *
	 * That said, even in a case when we checkpoint the pool exactly
	 * as spa_vdev_resilver_done() calls this function everything
	 * should be fine as the resilver will return right away.
	 */
	ASSERT(spa_namespace_held());
	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	/* Detach only makes sense for leaf vdevs (disks/files). */
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			/*
			 * A sibling whose path is "<vd's path>/old" is the
			 * in-place-replaced original; restore its path.
			 */
			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a normal spare, then it
	 * implies that the spare should become a real disk, and be removed
	 * from the active spare list for the pool.  dRAID spares on the
	 * other hand are coupled to the pool and thus should never be removed
	 * from the spares list.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];

		if (last_cvd->vdev_isspare &&
		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
			unspare = B_TRUE;
		}
	}

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.
	 * We must do this before vdev_remove_parent(), because that can
	 * change the GUID if it creates a new toplevel GUID.  For a similar
	 * reason, we must remove the spare now, in the same txg as the detach;
	 * otherwise someone could attach a new sibling, change the GUID, and
	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
	}

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool. For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	/* Copy the path now; vd may be freed once this txg syncs. */
	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
	spa_notify_waiters(spa);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		spa_namespace_enter(FTAG);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			/*
			 * Hold a reference on altspa so it can't go away
			 * while we drop the namespace lock for the removal.
			 */
			spa_open_ref(altspa, FTAG);
			spa_namespace_exit(FTAG);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			spa_namespace_enter(FTAG);
			spa_close(altspa, FTAG);
		}
		spa_namespace_exit(FTAG);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	spa_namespace_enter(FTAG);
	spa_close(spa, FTAG);
	spa_namespace_exit(FTAG);

	return (error);
}
8833 
8834 static int
8835 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
8836     list_t *vd_list)
8837 {
8838 	ASSERT(spa_namespace_held());
8839 
8840 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
8841 
8842 	/* Look up vdev and ensure it's a leaf. */
8843 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8844 	if (vd == NULL || vd->vdev_detached) {
8845 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8846 		return (SET_ERROR(ENODEV));
8847 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
8848 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8849 		return (SET_ERROR(EINVAL));
8850 	} else if (!vdev_writeable(vd)) {
8851 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8852 		return (SET_ERROR(EROFS));
8853 	}
8854 	mutex_enter(&vd->vdev_initialize_lock);
8855 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8856 
8857 	/*
8858 	 * When we activate an initialize action we check to see
8859 	 * if the vdev_initialize_thread is NULL. We do this instead
8860 	 * of using the vdev_initialize_state since there might be
8861 	 * a previous initialization process which has completed but
8862 	 * the thread is not exited.
8863 	 */
8864 	if (cmd_type == POOL_INITIALIZE_START &&
8865 	    (vd->vdev_initialize_thread != NULL ||
8866 	    vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
8867 		mutex_exit(&vd->vdev_initialize_lock);
8868 		return (SET_ERROR(EBUSY));
8869 	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
8870 	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
8871 	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
8872 		mutex_exit(&vd->vdev_initialize_lock);
8873 		return (SET_ERROR(ESRCH));
8874 	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
8875 	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
8876 		mutex_exit(&vd->vdev_initialize_lock);
8877 		return (SET_ERROR(ESRCH));
8878 	} else if (cmd_type == POOL_INITIALIZE_UNINIT &&
8879 	    vd->vdev_initialize_thread != NULL) {
8880 		mutex_exit(&vd->vdev_initialize_lock);
8881 		return (SET_ERROR(EBUSY));
8882 	}
8883 
8884 	switch (cmd_type) {
8885 	case POOL_INITIALIZE_START:
8886 		vdev_initialize(vd);
8887 		break;
8888 	case POOL_INITIALIZE_CANCEL:
8889 		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
8890 		break;
8891 	case POOL_INITIALIZE_SUSPEND:
8892 		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
8893 		break;
8894 	case POOL_INITIALIZE_UNINIT:
8895 		vdev_uninitialize(vd);
8896 		break;
8897 	default:
8898 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
8899 	}
8900 	mutex_exit(&vd->vdev_initialize_lock);
8901 
8902 	return (0);
8903 }
8904 
8905 int
8906 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
8907     nvlist_t *vdev_errlist)
8908 {
8909 	int total_errors = 0;
8910 	list_t vd_list;
8911 
8912 	list_create(&vd_list, sizeof (vdev_t),
8913 	    offsetof(vdev_t, vdev_initialize_node));
8914 
8915 	/*
8916 	 * We hold the namespace lock through the whole function
8917 	 * to prevent any changes to the pool while we're starting or
8918 	 * stopping initialization. The config and state locks are held so that
8919 	 * we can properly assess the vdev state before we commit to
8920 	 * the initializing operation.
8921 	 */
8922 	spa_namespace_enter(FTAG);
8923 
8924 	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
8925 	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
8926 		uint64_t vdev_guid = fnvpair_value_uint64(pair);
8927 
8928 		int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
8929 		    &vd_list);
8930 		if (error != 0) {
8931 			char guid_as_str[MAXNAMELEN];
8932 
8933 			(void) snprintf(guid_as_str, sizeof (guid_as_str),
8934 			    "%llu", (unsigned long long)vdev_guid);
8935 			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
8936 			total_errors++;
8937 		}
8938 	}
8939 
8940 	/* Wait for all initialize threads to stop. */
8941 	vdev_initialize_stop_wait(spa, &vd_list);
8942 
8943 	/* Sync out the initializing state */
8944 	txg_wait_synced(spa->spa_dsl_pool, 0);
8945 	spa_namespace_exit(FTAG);
8946 
8947 	list_destroy(&vd_list);
8948 
8949 	return (total_errors);
8950 }
8951 
8952 static int
8953 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
8954     uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
8955 {
8956 	ASSERT(spa_namespace_held());
8957 
8958 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
8959 
8960 	/* Look up vdev and ensure it's a leaf. */
8961 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8962 	if (vd == NULL || vd->vdev_detached) {
8963 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8964 		return (SET_ERROR(ENODEV));
8965 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
8966 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8967 		return (SET_ERROR(EINVAL));
8968 	} else if (!vdev_writeable(vd)) {
8969 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8970 		return (SET_ERROR(EROFS));
8971 	} else if (!vd->vdev_has_trim) {
8972 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8973 		return (SET_ERROR(EOPNOTSUPP));
8974 	} else if (secure && !vd->vdev_has_securetrim) {
8975 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8976 		return (SET_ERROR(EOPNOTSUPP));
8977 	}
8978 	mutex_enter(&vd->vdev_trim_lock);
8979 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8980 
8981 	/*
8982 	 * When we activate a TRIM action we check to see if the
8983 	 * vdev_trim_thread is NULL. We do this instead of using the
8984 	 * vdev_trim_state since there might be a previous TRIM process
8985 	 * which has completed but the thread is not exited.
8986 	 */
8987 	if (cmd_type == POOL_TRIM_START &&
8988 	    (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
8989 	    vd->vdev_top->vdev_rz_expanding)) {
8990 		mutex_exit(&vd->vdev_trim_lock);
8991 		return (SET_ERROR(EBUSY));
8992 	} else if (cmd_type == POOL_TRIM_CANCEL &&
8993 	    (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
8994 	    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
8995 		mutex_exit(&vd->vdev_trim_lock);
8996 		return (SET_ERROR(ESRCH));
8997 	} else if (cmd_type == POOL_TRIM_SUSPEND &&
8998 	    vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
8999 		mutex_exit(&vd->vdev_trim_lock);
9000 		return (SET_ERROR(ESRCH));
9001 	}
9002 
9003 	switch (cmd_type) {
9004 	case POOL_TRIM_START:
9005 		vdev_trim(vd, rate, partial, secure);
9006 		break;
9007 	case POOL_TRIM_CANCEL:
9008 		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
9009 		break;
9010 	case POOL_TRIM_SUSPEND:
9011 		vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
9012 		break;
9013 	default:
9014 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
9015 	}
9016 	mutex_exit(&vd->vdev_trim_lock);
9017 
9018 	return (0);
9019 }
9020 
9021 /*
9022  * Initiates a manual TRIM for the requested vdevs. This kicks off individual
9023  * TRIM threads for each child vdev.  These threads pass over all of the free
9024  * space in the vdev's metaslabs and issues TRIM commands for that space.
9025  */
9026 int
9027 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
9028     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
9029 {
9030 	int total_errors = 0;
9031 	list_t vd_list;
9032 
9033 	list_create(&vd_list, sizeof (vdev_t),
9034 	    offsetof(vdev_t, vdev_trim_node));
9035 
9036 	/*
9037 	 * We hold the namespace lock through the whole function
9038 	 * to prevent any changes to the pool while we're starting or
9039 	 * stopping TRIM. The config and state locks are held so that
9040 	 * we can properly assess the vdev state before we commit to
9041 	 * the TRIM operation.
9042 	 */
9043 	spa_namespace_enter(FTAG);
9044 
9045 	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
9046 	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
9047 		uint64_t vdev_guid = fnvpair_value_uint64(pair);
9048 
9049 		int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
9050 		    rate, partial, secure, &vd_list);
9051 		if (error != 0) {
9052 			char guid_as_str[MAXNAMELEN];
9053 
9054 			(void) snprintf(guid_as_str, sizeof (guid_as_str),
9055 			    "%llu", (unsigned long long)vdev_guid);
9056 			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
9057 			total_errors++;
9058 		}
9059 	}
9060 
9061 	/* Wait for all TRIM threads to stop. */
9062 	vdev_trim_stop_wait(spa, &vd_list);
9063 
9064 	/* Sync out the TRIM state */
9065 	txg_wait_synced(spa->spa_dsl_pool, 0);
9066 	spa_namespace_exit(FTAG);
9067 
9068 	list_destroy(&vd_list);
9069 
9070 	return (total_errors);
9071 }
9072 
9073 /*
9074  * Split a set of devices from their mirrors, and create a new pool from them.
9075  */
int
spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	const char *altroot = NULL;
	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	ASSERT(spa_namespace_held());
	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	/*
	 * The config lock must be dropped while the logs are reset and
	 * reacquired afterwards; 'txg' is refreshed to the new config txg.
	 */
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	error = spa_reset_logs(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
		spa_activate_log(spa);

	if (error != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
		return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	 * scan through all the children to ensure they're all mirrors
	 */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* don't count the holes & logs as children */
		if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
		    !vdev_is_concrete(vd))) {
			/*
			 * lastlog remembers the start of a trailing run of
			 * log/hole vdevs; it is reset whenever a concrete
			 * vdev follows.
			 */
			if (lastlog == 0)
				lastlog = c;
			continue;
		}

		lastlog = 0;
	}
	/* The split config must cover every leading concrete top-level vdev. */
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
		uint64_t is_hole = 0;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &is_hole);

		if (is_hole != 0) {
			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
				continue;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
		}

		/* deal with indirect vdevs */
		if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
		    &vdev_indirect_ops)
			continue;

		/* which disk is going to be split? */
		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
		    &glist[c]) != 0) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* look it up in the spa */
		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
		if (vml[c] == NULL) {
			error = SET_ERROR(ENODEV);
			break;
		}

		/* make sure there's nothing stopping the split */
		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
		    vml[c]->vdev_islog ||
		    !vdev_is_concrete(vml[c]) ||
		    vml[c]->vdev_isspare ||
		    vml[c]->vdev_isl2cache ||
		    !vdev_writeable(vml[c]) ||
		    vml[c]->vdev_children != 0 ||
		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
			error = SET_ERROR(EINVAL);
			break;
		}

		if (vdev_dtl_required(vml[c]) ||
		    vdev_resilver_needed(vml[c], NULL, NULL)) {
			error = SET_ERROR(EBUSY);
			break;
		}

		/* we need certain info from the top level */
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
		    vml[c]->vdev_top->vdev_ms_array);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
		    vml[c]->vdev_top->vdev_ms_shift);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
		    vml[c]->vdev_top->vdev_asize);
		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
		    vml[c]->vdev_top->vdev_ashift);

		/* transfer per-vdev ZAPs */
		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
		VERIFY0(nvlist_add_uint64(child[c],
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));

		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
		/*
		 * NOTE(review): vml[c] is a mirror child, so vdev_parent is
		 * presumably the same vdev as vdev_top (whose top_zap the
		 * assert above checks) — confirm before relying on this.
		 */
		VERIFY0(nvlist_add_uint64(child[c],
		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    vml[c]->vdev_parent->vdev_top_zap));
	}

	if (error != 0) {
		kmem_free(vml, children * sizeof (vdev_t *));
		kmem_free(glist, children * sizeof (uint64_t));
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);

	/*
	 * Temporarily record the splitting vdevs in the spa config.  This
	 * will disappear once the config is regenerated.
	 */
	nvl = fnvlist_alloc();
	fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_generate_guid(NULL));
	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

	/*
	 * Temporarily stop the initializing and TRIM activity.  We set the
	 * state to ACTIVE so that we know to resume initializing or TRIM
	 * once the split has completed.
	 */
	list_t vd_initialize_list;
	list_create(&vd_initialize_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	list_t vd_trim_list;
	list_create(&vd_trim_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_trim_node));

	for (c = 0; c < children; c++) {
		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
			mutex_enter(&vml[c]->vdev_initialize_lock);
			vdev_initialize_stop(vml[c],
			    VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
			mutex_exit(&vml[c]->vdev_initialize_lock);

			mutex_enter(&vml[c]->vdev_trim_lock);
			vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
			mutex_exit(&vml[c]->vdev_trim_lock);
		}
	}

	vdev_initialize_stop_wait(spa, &vd_initialize_list);
	vdev_trim_stop_wait(spa, &vd_trim_list);

	list_destroy(&vd_initialize_list);
	list_destroy(&vd_trim_list);

	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
	newspa->spa_is_splitting = B_TRUE;

	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
	if (error)
		goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
		newspa->spa_config_splitting = fnvlist_alloc();
		fnvlist_add_uint64(newspa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
		    B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
		spa_configfile_set(newspa, props, B_FALSE);
		error = spa_prop_set(newspa, props);
		if (error)
			goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 2);

	spa_async_resume(newspa);

	/* finally, update the original pool's config */
	txg = spa_vdev_config_enter(spa);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	/*
	 * NOTE(review): with DMU_TX_WAIT an error here is presumably only
	 * possible if the pool suspends; the split still proceeds, just
	 * without history logging or a tx commit — confirm this is intended.
	 */
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0)
		dmu_tx_abort(tx);
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
			vdev_t *tvd = vml[c]->vdev_top;

			/*
			 * Need to be sure the detachable VDEV is not
			 * on any *other* txg's DTL list to prevent it
			 * from being accessed after it's freed.
			 */
			for (int t = 0; t < TXG_SIZE; t++) {
				(void) txg_list_remove_this(
				    &tvd->vdev_dtl_list, vml[c], t);
			}

			vdev_split(vml[c]);
			if (error == 0)
				spa_history_log_internal(spa, "detach", tx,
				    "vdev=%s", vml[c]->vdev_path);

			vdev_free(vml[c]);
		}
	}
	spa->spa_avz_action = AVZ_ACTION_REBUILD;
	vdev_config_dirty(spa->spa_root_vdev);
	spa->spa_config_splitting = NULL;
	nvlist_free(nvl);
	if (error == 0)
		dmu_tx_commit(tx);
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 3);

	/* split is complete; log a history record */
	spa_history_log_internal(newspa, "split", NULL,
	    "from pool %s", spa_name(spa));

	newspa->spa_is_splitting = B_FALSE;
	kmem_free(vml, children * sizeof (vdev_t *));

	/* if we're not going to mount the filesystems in userland, export */
	if (exp)
		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
		    B_FALSE, B_FALSE);

	return (error);

out:
	/* Failure: tear down the half-built pool and undo our changes. */
	spa_unload(newspa);
	spa_deactivate(newspa);
	spa_remove(newspa);

	txg = spa_vdev_config_enter(spa);

	/* re-online all offlined disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_FALSE;
	}

	/* restart initializing or trimming disks as necessary */
	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

	vdev_reopen(spa->spa_root_vdev);

	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	(void) spa_vdev_exit(spa, NULL, txg, error);

	kmem_free(vml, children * sizeof (vdev_t *));
	return (error);
}
9430 
9431 /*
9432  * Find any device that's done replacing, or a vdev marked 'unspare' that's
9433  * currently spared, so we can detach it.
9434  */
9435 static vdev_t *
9436 spa_vdev_resilver_done_hunt(vdev_t *vd)
9437 {
9438 	vdev_t *newvd, *oldvd;
9439 
9440 	for (int c = 0; c < vd->vdev_children; c++) {
9441 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
9442 		if (oldvd != NULL)
9443 			return (oldvd);
9444 	}
9445 
9446 	/*
9447 	 * Check for a completed replacement.  We always consider the first
9448 	 * vdev in the list to be the oldest vdev, and the last one to be
9449 	 * the newest (see spa_vdev_attach() for how that works).  In
9450 	 * the case where the newest vdev is faulted, we will not automatically
9451 	 * remove it after a resilver completes.  This is OK as it will require
9452 	 * user intervention to determine which disk the admin wishes to keep.
9453 	 */
9454 	if (vd->vdev_ops == &vdev_replacing_ops) {
9455 		ASSERT(vd->vdev_children > 1);
9456 
9457 		newvd = vd->vdev_child[vd->vdev_children - 1];
9458 		oldvd = vd->vdev_child[0];
9459 
9460 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
9461 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
9462 		    !vdev_dtl_required(oldvd))
9463 			return (oldvd);
9464 	}
9465 
9466 	/*
9467 	 * Check for a completed resilver with the 'unspare' flag set.
9468 	 * Also potentially update faulted state.
9469 	 */
9470 	if (vd->vdev_ops == &vdev_spare_ops) {
9471 		vdev_t *first = vd->vdev_child[0];
9472 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
9473 
9474 		if (last->vdev_unspare) {
9475 			oldvd = first;
9476 			newvd = last;
9477 		} else if (first->vdev_unspare) {
9478 			oldvd = last;
9479 			newvd = first;
9480 		} else {
9481 			oldvd = NULL;
9482 		}
9483 
9484 		if (oldvd != NULL &&
9485 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
9486 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
9487 		    !vdev_dtl_required(oldvd))
9488 			return (oldvd);
9489 
9490 		vdev_propagate_state(vd);
9491 
9492 		/*
9493 		 * If there are more than two spares attached to a disk,
9494 		 * and those spares are not required, then we want to
9495 		 * attempt to free them up now so that they can be used
9496 		 * by other pools.  Once we're back down to a single
9497 		 * disk+spare, we stop removing them.
9498 		 */
9499 		if (vd->vdev_children > 2) {
9500 			newvd = vd->vdev_child[1];
9501 
9502 			if (newvd->vdev_isspare && last->vdev_isspare &&
9503 			    vdev_dtl_empty(last, DTL_MISSING) &&
9504 			    vdev_dtl_empty(last, DTL_OUTAGE) &&
9505 			    !vdev_dtl_required(newvd))
9506 				return (newvd);
9507 		}
9508 	}
9509 
9510 	return (NULL);
9511 }
9512 
/*
 * Detach every device that spa_vdev_resilver_done_hunt() reports as done
 * replacing (or flagged 'unspare'), one at a time, until none remain.
 */
static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		/*
		 * Capture the guids of the vdev, its parent, and grandparent
		 * now: the vdev_t's may be freed by spa_vdev_detach() once
		 * we drop the config lock below.
		 */
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

		/*
		 * Drop the config lock across spa_vdev_detach() (which
		 * manages its own locking), then re-take it and rescan
		 * from the root for the next candidate.
		 */
		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * If a detach was not performed above replace waiters will not have
	 * been notified.  In which case we must do so now.
	 */
	spa_notify_waiters(spa);
}
9556 
9557 /*
9558  * Update the stored path or FRU for this vdev.
9559  */
9560 static int
9561 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
9562     boolean_t ispath)
9563 {
9564 	vdev_t *vd;
9565 	boolean_t sync = B_FALSE;
9566 
9567 	ASSERT(spa_writeable(spa));
9568 
9569 	spa_vdev_state_enter(spa, SCL_ALL);
9570 
9571 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
9572 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
9573 
9574 	if (!vd->vdev_ops->vdev_op_leaf)
9575 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
9576 
9577 	if (ispath) {
9578 		if (strcmp(value, vd->vdev_path) != 0) {
9579 			spa_strfree(vd->vdev_path);
9580 			vd->vdev_path = spa_strdup(value);
9581 			sync = B_TRUE;
9582 		}
9583 	} else {
9584 		if (vd->vdev_fru == NULL) {
9585 			vd->vdev_fru = spa_strdup(value);
9586 			sync = B_TRUE;
9587 		} else if (strcmp(value, vd->vdev_fru) != 0) {
9588 			spa_strfree(vd->vdev_fru);
9589 			vd->vdev_fru = spa_strdup(value);
9590 			sync = B_TRUE;
9591 		}
9592 	}
9593 
9594 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
9595 }
9596 
/*
 * Set the stored device path for the leaf vdev identified by 'guid'.
 * Thin wrapper around spa_vdev_set_common().
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}
9602 
/*
 * Set the stored FRU (field-replaceable unit) string for the leaf vdev
 * identified by 'guid'.  Thin wrapper around spa_vdev_set_common().
 */
int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}
9608 
9609 /*
9610  * ==========================================================================
9611  * SPA Scanning
9612  * ==========================================================================
9613  */
9614 int
9615 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
9616 {
9617 	ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER));
9618 
9619 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
9620 		return (SET_ERROR(EBUSY));
9621 
9622 	return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
9623 }
9624 
9625 int
9626 spa_scan_stop(spa_t *spa)
9627 {
9628 	ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER));
9629 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
9630 		return (SET_ERROR(EBUSY));
9631 
9632 	return (dsl_scan_cancel(spa->spa_dsl_pool));
9633 }
9634 
/*
 * Start a scan of the given type over the whole pool; equivalent to
 * spa_scan_range() with no txg range restriction.
 */
int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
	return (spa_scan_range(spa, func, 0, 0));
}
9640 
9641 int
9642 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart,
9643     uint64_t txgend)
9644 {
9645 	ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER));
9646 
9647 	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
9648 		return (SET_ERROR(ENOTSUP));
9649 
9650 	if (func == POOL_SCAN_RESILVER &&
9651 	    !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
9652 		return (SET_ERROR(ENOTSUP));
9653 
9654 	if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0))
9655 		return (SET_ERROR(ENOTSUP));
9656 
9657 	/*
9658 	 * If a resilver was requested, but there is no DTL on a
9659 	 * writeable leaf device, we have nothing to do.
9660 	 */
9661 	if (func == POOL_SCAN_RESILVER &&
9662 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
9663 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
9664 		return (0);
9665 	}
9666 
9667 	if (func == POOL_SCAN_ERRORSCRUB &&
9668 	    !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
9669 		return (SET_ERROR(ENOTSUP));
9670 
9671 	return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend));
9672 }
9673 
9674 /*
9675  * ==========================================================================
9676  * SPA async task processing
9677  * ==========================================================================
9678  */
9679 
/*
 * Walk the vdev tree rooted at 'vd' and transition every vdev with
 * vdev_remove_wanted set to the REMOVED state, posting a removal event
 * to userspace.  'by_kernel' distinguishes kernel-initiated removal from
 * a user request and is forwarded to zfs_post_remove().
 */
static void
spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = B_FALSE;
		vd->vdev_delayed_close = B_FALSE;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		/* Make sure the new state reaches the label/config. */
		vdev_state_dirty(vd->vdev_top);

		/* Tell userspace that the vdev is gone. */
		zfs_post_remove(spa, vd, by_kernel);
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c], by_kernel);
}
9707 
9708 static void
9709 spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend)
9710 {
9711 	if (vd->vdev_fault_wanted) {
9712 		vdev_state_t newstate = VDEV_STATE_FAULTED;
9713 		vd->vdev_fault_wanted = B_FALSE;
9714 
9715 		/*
9716 		 * If this device has the only valid copy of the data, then
9717 		 * back off and simply mark the vdev as degraded instead.
9718 		 */
9719 		if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL &&
9720 		    vdev_dtl_required(vd)) {
9721 			newstate = VDEV_STATE_DEGRADED;
9722 			/* A required disk is missing so suspend the pool */
9723 			*suspend = B_TRUE;
9724 		}
9725 		vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
9726 	}
9727 	for (int c = 0; c < vd->vdev_children; c++)
9728 		spa_async_fault_vdev(vd->vdev_child[c], suspend);
9729 }
9730 
9731 static void
9732 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
9733 {
9734 	if (!spa->spa_autoexpand)
9735 		return;
9736 
9737 	for (int c = 0; c < vd->vdev_children; c++) {
9738 		vdev_t *cvd = vd->vdev_child[c];
9739 		spa_async_autoexpand(spa, cvd);
9740 	}
9741 
9742 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
9743 		return;
9744 
9745 	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
9746 }
9747 
/*
 * Worker thread servicing the SPA_ASYNC_* task bits posted via
 * spa_async_request().  It claims all currently-pending tasks, performs
 * each requested one, then clears spa_async_thread and broadcasts
 * spa_async_cv before exiting.
 */
static __attribute__((noreturn)) void
spa_async_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	int tasks;

	ASSERT(spa->spa_sync_on);

	/* Snapshot and clear the pending task mask under the async lock. */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		spa_namespace_enter(FTAG);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		old_space += metaslab_class_get_space(spa_special_class(spa));
		old_space += metaslab_class_get_space(spa_dedup_class(spa));
		old_space += metaslab_class_get_space(
		    spa_embedded_log_class(spa));
		old_space += metaslab_class_get_space(
		    spa_special_embedded_log_class(spa));

		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

		new_space = metaslab_class_get_space(spa_normal_class(spa));
		new_space += metaslab_class_get_space(spa_special_class(spa));
		new_space += metaslab_class_get_space(spa_dedup_class(spa));
		new_space += metaslab_class_get_space(
		    spa_embedded_log_class(spa));
		new_space += metaslab_class_get_space(
		    spa_special_embedded_log_class(spa));
		spa_namespace_exit(FTAG);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(spa, "vdev online", NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), (u_longlong_t)new_space,
			    (u_longlong_t)(new_space - old_space));
		}
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) {
		boolean_t by_kernel = B_TRUE;
		if (tasks & SPA_ASYNC_REMOVE_BY_USER)
			by_kernel = B_FALSE;
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev, by_kernel);
		/* Also process the auxiliary (l2cache/spare) vdevs. */
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i],
			    by_kernel);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i],
			    by_kernel);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be marked faulted.
	 */
	if (tasks & SPA_ASYNC_FAULT_VDEV) {
		spa_vdev_state_enter(spa, SCL_NONE);
		boolean_t suspend = B_FALSE;
		spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
		(void) spa_vdev_state_exit(spa, NULL, 0);
		/* Suspend outside the vdev state lock. */
		if (suspend)
			zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE ||
	    tasks & SPA_ASYNC_REBUILD_DONE ||
	    tasks & SPA_ASYNC_DETACH_SPARE) {
		spa_vdev_resilver_done(spa);
	}

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER &&
	    !vdev_rebuild_active(spa->spa_root_vdev) &&
	    (!dsl_scan_resilvering(dp) ||
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
		dsl_scan_restart_resilver(dp, 0);

	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
		spa_namespace_enter(FTAG);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		vdev_initialize_restart(spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_namespace_exit(FTAG);
	}

	if (tasks & SPA_ASYNC_TRIM_RESTART) {
		spa_namespace_enter(FTAG);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		vdev_trim_restart(spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_namespace_exit(FTAG);
	}

	if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
		spa_namespace_enter(FTAG);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		vdev_autotrim_restart(spa);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_namespace_exit(FTAG);
	}

	/*
	 * Kick off L2 cache whole device TRIM.
	 */
	if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
		spa_namespace_enter(FTAG);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		vdev_trim_l2arc(spa);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_namespace_exit(FTAG);
	}

	/*
	 * Kick off L2 cache rebuilding.
	 */
	if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
		spa_namespace_enter(FTAG);
		spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
		l2arc_spa_rebuild_start(spa);
		spa_config_exit(spa, SCL_L2ARC, FTAG);
		spa_namespace_exit(FTAG);
	}

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}
9909 
/*
 * Block new async work: bump the suspend count, wait for any in-flight
 * async thread to finish, then suspend device removal and cancel the
 * auxiliary zthr workers.  Balanced by spa_async_resume().
 */
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	/* Wait for a running spa_async_thread() to signal completion. */
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);

	spa_vdev_remove_suspend(spa);

	/* Cancel each auxiliary zthr, if it has been created. */
	zthr_t *condense_thread = spa->spa_condense_zthr;
	if (condense_thread != NULL)
		zthr_cancel(condense_thread);

	zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
	if (raidz_expand_thread != NULL)
		zthr_cancel(raidz_expand_thread);

	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
	if (discard_thread != NULL)
		zthr_cancel(discard_thread);

	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
	if (ll_delete_thread != NULL)
		zthr_cancel(ll_delete_thread);

	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
	if (ll_condense_thread != NULL)
		zthr_cancel(ll_condense_thread);
}
9941 
9942 void
9943 spa_async_resume(spa_t *spa)
9944 {
9945 	mutex_enter(&spa->spa_async_lock);
9946 	ASSERT(spa->spa_async_suspended != 0);
9947 	spa->spa_async_suspended--;
9948 	mutex_exit(&spa->spa_async_lock);
9949 	spa_restart_removal(spa);
9950 
9951 	zthr_t *condense_thread = spa->spa_condense_zthr;
9952 	if (condense_thread != NULL)
9953 		zthr_resume(condense_thread);
9954 
9955 	zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
9956 	if (raidz_expand_thread != NULL)
9957 		zthr_resume(raidz_expand_thread);
9958 
9959 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
9960 	if (discard_thread != NULL)
9961 		zthr_resume(discard_thread);
9962 
9963 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
9964 	if (ll_delete_thread != NULL)
9965 		zthr_resume(ll_delete_thread);
9966 
9967 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
9968 	if (ll_condense_thread != NULL)
9969 		zthr_resume(ll_condense_thread);
9970 }
9971 
9972 static boolean_t
9973 spa_async_tasks_pending(spa_t *spa)
9974 {
9975 	uint_t non_config_tasks;
9976 	uint_t config_task;
9977 	boolean_t config_task_suspended;
9978 
9979 	non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
9980 	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
9981 	if (spa->spa_ccw_fail_time == 0) {
9982 		config_task_suspended = B_FALSE;
9983 	} else {
9984 		config_task_suspended =
9985 		    (gethrtime() - spa->spa_ccw_fail_time) <
9986 		    ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
9987 	}
9988 
9989 	return (non_config_tasks || (config_task && !config_task_suspended));
9990 }
9991 
9992 static void
9993 spa_async_dispatch(spa_t *spa)
9994 {
9995 	mutex_enter(&spa->spa_async_lock);
9996 	if (spa_async_tasks_pending(spa) &&
9997 	    !spa->spa_async_suspended &&
9998 	    spa->spa_async_thread == NULL)
9999 		spa->spa_async_thread = thread_create(NULL, 0,
10000 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
10001 	mutex_exit(&spa->spa_async_lock);
10002 }
10003 
/*
 * Post one or more SPA_ASYNC_* task bits for the async worker; the
 * work itself is started later by spa_async_dispatch().
 */
void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa_load_name(spa), task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}
10012 
/*
 * Return the pending SPA_ASYNC_* task mask.  Read without taking
 * spa_async_lock, so this is only a racy snapshot.
 */
int
spa_async_tasks(spa_t *spa)
{
	return (spa->spa_async_tasks);
}
10018 
10019 /*
10020  * ==========================================================================
10021  * SPA syncing routines
10022  * ==========================================================================
10023  */
10024 
10025 
10026 static int
10027 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
10028     dmu_tx_t *tx)
10029 {
10030 	bpobj_t *bpo = arg;
10031 	bpobj_enqueue(bpo, bp, bp_freed, tx);
10032 	return (0);
10033 }
10034 
/* Callback: enqueue 'bp' on the bpobj 'arg' as an allocation. */
int
bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
}
10040 
/* Callback: enqueue 'bp' on the bpobj 'arg' as a free. */
int
bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
}
10046 
10047 static int
10048 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
10049 {
10050 	zio_t *pio = arg;
10051 
10052 	zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
10053 	    pio->io_flags));
10054 	return (0);
10055 }
10056 
/*
 * bpobj-flavored wrapper for spa_free_sync_cb(); entries iterated here
 * must never already be marked freed.
 */
static int
bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	ASSERT(!bp_freed);
	return (spa_free_sync_cb(arg, bp, tx));
}
10064 
10065 /*
10066  * Note: this simple function is not inlined to make it easier to dtrace the
10067  * amount of time spent syncing frees.
10068  */
10069 static void
10070 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
10071 {
10072 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
10073 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
10074 	VERIFY0(zio_wait(zio));
10075 }
10076 
10077 /*
10078  * Note: this simple function is not inlined to make it easier to dtrace the
10079  * amount of time spent syncing deferred frees.
10080  */
10081 static void
10082 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
10083 {
10084 	if (spa_sync_pass(spa) != 1)
10085 		return;
10086 
10087 	/*
10088 	 * Note:
10089 	 * If the log space map feature is active, we stop deferring
10090 	 * frees to the next TXG and therefore running this function
10091 	 * would be considered a no-op as spa_deferred_bpobj should
10092 	 * not have any entries.
10093 	 *
10094 	 * That said we run this function anyway (instead of returning
10095 	 * immediately) for the edge-case scenario where we just
10096 	 * activated the log space map feature in this TXG but we have
10097 	 * deferred frees from the previous TXG.
10098 	 */
10099 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
10100 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
10101 	    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
10102 	VERIFY0(zio_wait(zio));
10103 }
10104 
/*
 * Pack 'nv' with XDR encoding and write it into MOS object 'obj',
 * padding the buffer out to a multiple of SPA_CONFIG_BLOCKSIZE.  The
 * unpacked nvlist size is recorded in the object's bonus buffer.
 */
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR));

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dmu_buf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = vmem_alloc(bufsize, KM_SLEEP);

	VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP));
	/* Zero the padding between the packed data and the block edge. */
	memset(packed + nvsize, 0, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx,
	    DMU_READ_NO_PREFETCH);

	vmem_free(packed, bufsize);

	/* Record the true (unpadded) nvlist size in the bonus buffer. */
	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}
10137 
10138 static void
10139 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
10140     const char *config, const char *entry)
10141 {
10142 	nvlist_t *nvroot;
10143 	nvlist_t **list;
10144 	int i;
10145 
10146 	if (!sav->sav_sync)
10147 		return;
10148 
10149 	/*
10150 	 * Update the MOS nvlist describing the list of available devices.
10151 	 * spa_validate_aux() will have already made sure this nvlist is
10152 	 * valid and the vdevs are labeled appropriately.
10153 	 */
10154 	if (sav->sav_object == 0) {
10155 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
10156 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
10157 		    sizeof (uint64_t), tx);
10158 		VERIFY(zap_update(spa->spa_meta_objset,
10159 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
10160 		    &sav->sav_object, tx) == 0);
10161 	}
10162 
10163 	nvroot = fnvlist_alloc();
10164 	if (sav->sav_count == 0) {
10165 		fnvlist_add_nvlist_array(nvroot, config,
10166 		    (const nvlist_t * const *)NULL, 0);
10167 	} else {
10168 		list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
10169 		for (i = 0; i < sav->sav_count; i++)
10170 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
10171 			    B_FALSE, VDEV_CONFIG_L2CACHE);
10172 		fnvlist_add_nvlist_array(nvroot, config,
10173 		    (const nvlist_t * const *)list, sav->sav_count);
10174 		for (i = 0; i < sav->sav_count; i++)
10175 			nvlist_free(list[i]);
10176 		kmem_free(list, sav->sav_count * sizeof (void *));
10177 	}
10178 
10179 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
10180 	nvlist_free(nvroot);
10181 
10182 	sav->sav_sync = B_FALSE;
10183 }
10184 
10185 /*
10186  * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
10187  * The all-vdev ZAP must be empty.
10188  */
10189 static void
10190 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
10191 {
10192 	spa_t *spa = vd->vdev_spa;
10193 
10194 	if (vd->vdev_root_zap != 0 &&
10195 	    spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
10196 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
10197 		    vd->vdev_root_zap, tx));
10198 	}
10199 	if (vd->vdev_top_zap != 0) {
10200 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
10201 		    vd->vdev_top_zap, tx));
10202 	}
10203 	if (vd->vdev_leaf_zap != 0) {
10204 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
10205 		    vd->vdev_leaf_zap, tx));
10206 	}
10207 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
10208 		spa_avz_build(vd->vdev_child[i], avz, tx);
10209 	}
10210 }
10211 
/*
 * Sync the in-core pool config out to the MOS config object for this
 * txg.  Any pending all-vdev-ZAP (AVZ) action is handled first: REBUILD
 * reconstructs the AVZ from the vdev tree and destroys orphaned per-vdev
 * ZAPs; DESTROY removes every listed ZAP along with the AVZ itself.
 */
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	/*
	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
	 * its config may not be dirty but we still need to build per-vdev ZAPs.
	 * Similarly, if the pool is being assembled (e.g. after a split), we
	 * need to rebuild the AVZ although the config may not be dirty.
	 */
	if (list_is_empty(&spa->spa_config_dirty_list) &&
	    spa->spa_avz_action == AVZ_ACTION_NONE)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
	    spa->spa_all_vdev_zaps != 0);

	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
		/* Make and build the new AVZ */
		uint64_t new_avz = zap_create(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
		spa_avz_build(spa->spa_root_vdev, new_avz, tx);

		/* Diff old AVZ with new one */
		zap_cursor_t zc;
		zap_attribute_t *za = zap_attribute_alloc();

		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t vdzap = za->za_first_integer;
			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
			    vdzap) == ENOENT) {
				/*
				 * ZAP is listed in old AVZ but not in new one;
				 * destroy it
				 */
				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
				    tx));
			}
		}

		zap_cursor_fini(&zc);
		zap_attribute_free(za);

		/* Destroy the old AVZ */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));

		/* Replace the old AVZ in the dir obj with the new one */
		VERIFY0(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
		    sizeof (new_avz), 1, &new_avz, tx));

		spa->spa_all_vdev_zaps = new_avz;
	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
		zap_cursor_t zc;
		zap_attribute_t *za = zap_attribute_alloc();

		/* Walk through the AVZ and destroy all listed ZAPs */
		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t zap = za->za_first_integer;
			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
		}

		zap_cursor_fini(&zc);
		zap_attribute_free(za);

		/* Destroy and unlink the AVZ itself */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));
		VERIFY0(zap_remove(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
		spa->spa_all_vdev_zaps = 0;
	}

	/* No AVZ yet (or just destroyed): create and link a fresh one. */
	if (spa->spa_all_vdev_zaps == 0) {
		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_VDEV_ZAP_MAP, tx);
	}
	spa->spa_avz_action = AVZ_ACTION_NONE;

	/* Create ZAPs for vdevs that don't have them. */
	vdev_construct_zaps(spa->spa_root_vdev, tx);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	/*
	 * If we're upgrading the spa version then make sure that
	 * the config object gets updated with the correct version.
	 */
	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa->spa_uberblock.ub_version);

	spa_config_exit(spa, SCL_STATE, FTAG);

	/* Remember what we wrote this txg, and write it to the MOS. */
	nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}
10324 
/*
 * Sync task callback: raise the pool's on-disk SPA version to *arg.
 * The new version must be a supported one and no lower than the current
 * version; the vdev config is dirtied so the labels pick it up.
 */
static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
	uint64_t *versionp = arg;
	uint64_t version = *versionp;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	/*
	 * Setting the version is special cased when first creating the pool.
	 */
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_history_log_internal(spa, "set", tx, "version=%lld",
	    (longlong_t)version);
}
10345 
10346 /*
10347  * Set zpool properties.
10348  */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	/*
	 * Sync task callback: 'arg' is an nvlist of property name/value
	 * pairs to persist.  The whole update is serialized against other
	 * property readers/writers by spa_props_lock.
	 */
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		const char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		const char *elemname = nvpair_name(elem);
		zprop_type_t proptype;
		spa_feature_t fid;

		switch (prop = zpool_name_to_prop(elemname)) {
		case ZPOOL_PROP_VERSION:
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			strval = fnvpair_value_string(elem);
			/* Replace the in-core comment string. */
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  We also need to
			 * update the cache file to keep it in sync with the
			 * MOS version. It's unnecessary to do this for pool
			 * creation since the vdev's configuration has already
			 * been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				vdev_config_dirty(spa->spa_root_vdev);
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			}
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", elemname, strval);
			break;
		case ZPOOL_PROP_COMPATIBILITY:
			strval = fnvpair_value_string(elem);
			if (spa->spa_compatibility != NULL)
				spa_strfree(spa->spa_compatibility);
			spa->spa_compatibility = spa_strdup(strval);
			/*
			 * Dirty the configuration on vdevs as above.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				vdev_config_dirty(spa->spa_root_vdev);
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			}

			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;

		case ZPOOL_PROP_INVAL:
			/*
			 * Not a built-in pool property: either a feature
			 * ("feature@name") being enabled, or a user property.
			 */
			if (zpool_prop_feature(elemname)) {
				fname = strchr(elemname, '@') + 1;
				VERIFY0(zfeature_lookup_name(fname, &fid));

				spa_feature_enable(spa, fid, tx);
				spa_history_log_internal(spa, "set", tx,
				    "%s=enabled", elemname);
				break;
			} else if (!zfs_prop_user(elemname)) {
				ASSERT(zpool_prop_feature(elemname));
				break;
			}
			/* User properties are stored like ordinary props. */
			zfs_fallthrough;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			if (prop == ZPOOL_PROP_INVAL) {
				propname = elemname;
				proptype = PROP_TYPE_STRING;
			} else {
				propname = zpool_prop_to_name(prop);
				proptype = zpool_prop_get_type(prop);
			}

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				if (strlen(strval) == 0) {
					/* remove the property if value == "" */
					(void) zap_remove(mos,
					    spa->spa_pool_props_object,
					    propname, tx);
				} else {
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object,
					    propname, 1, strlen(strval) + 1,
					    strval, tx));
				}
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", elemname, strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					/* Reject out-of-range index values. */
					VERIFY0(zpool_prop_index_to_string(
					    prop, intval, &unused));
				}
				/* Store as a single 8-byte integer. */
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", elemname,
				    (longlong_t)intval);

				/* Refresh the in-core copy cached on spa_t. */
				switch (prop) {
				case ZPOOL_PROP_DELEGATION:
					spa->spa_delegation = intval;
					break;
				case ZPOOL_PROP_BOOTFS:
					spa->spa_bootfs = intval;
					break;
				case ZPOOL_PROP_FAILUREMODE:
					spa->spa_failmode = intval;
					break;
				case ZPOOL_PROP_AUTOTRIM:
					spa->spa_autotrim = intval;
					spa_async_request(spa,
					    SPA_ASYNC_AUTOTRIM_RESTART);
					break;
				case ZPOOL_PROP_AUTOEXPAND:
					spa->spa_autoexpand = intval;
					if (tx->tx_txg != TXG_INITIAL)
						spa_async_request(spa,
						    SPA_ASYNC_AUTOEXPAND);
					break;
				case ZPOOL_PROP_MULTIHOST:
					spa->spa_multihost = intval;
					break;
				case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
					spa->spa_dedup_table_quota = intval;
					break;
				default:
					break;
				}
			} else {
				ASSERT(0); /* not allowed */
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}
10534 
10535 /*
10536  * Perform one-time upgrade on-disk changes.  spa_version() does not
10537  * reflect the new version this txg, so there must be no changes this
10538  * txg to anything that the upgrade code depends on after it executes.
10539  * Therefore this must be called after dsl_pool_sync() does the sync
10540  * tasks.
10541  */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	/* Upgrade steps must run exactly once, in the first sync pass. */
	if (spa_sync_pass(spa) != 1)
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/*
	 * Each block below compares the previously-synced version
	 * (spa_ubsync) against the version being synced (spa_uberblock)
	 * to detect a version boundary being crossed in this txg.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
	 * when possibility to use lz4 compression for metadata was added
	 * Old pools that have this feature enabled must be upgraded to have
	 * this feature active
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now.  Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}
10608 
/*
 * Sanity-check a top-level vdev's indirect-mapping state before it is
 * synced.  This function consists entirely of assertions.
 */
static void
vdev_indirect_state_sync_verify(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;

	/* An indirect vdev must have both its mapping and births objects. */
	if (vd->vdev_ops == &vdev_indirect_ops) {
		ASSERT(vim != NULL);
		ASSERT(vib != NULL);
	}

	uint64_t obsolete_sm_object = 0;
	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		/*
		 * An obsolete space map only makes sense on a vdev that is
		 * being removed or is already indirect, and it must be
		 * consistent with the loaded space map and the mapping.
		 */
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
		ASSERT3U(obsolete_sm_object, ==,
		    space_map_object(vd->vdev_obsolete_sm));
		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
		    space_map_allocated(vd->vdev_obsolete_sm));
	}
	ASSERT(vd->vdev_obsolete_segments != NULL);

	/*
	 * Since frees / remaps to an indirect vdev can only
	 * happen in syncing context, the obsolete segments
	 * tree must be empty when we start syncing.
	 */
	ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments));
}
10642 
10643 /*
10644  * Set the top-level vdev's max queue depth. Evaluate each top-level's
10645  * async write queue depth in case it changed. The max queue depth will
10646  * not change in the middle of syncing out this txg.
10647  */
10648 static void
10649 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
10650 {
10651 	ASSERT(spa_writeable(spa));
10652 
10653 	metaslab_class_balance(spa_normal_class(spa), B_TRUE);
10654 	metaslab_class_balance(spa_special_class(spa), B_TRUE);
10655 	metaslab_class_balance(spa_dedup_class(spa), B_TRUE);
10656 }
10657 
10658 static void
10659 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
10660 {
10661 	ASSERT(spa_writeable(spa));
10662 
10663 	vdev_t *rvd = spa->spa_root_vdev;
10664 	for (int c = 0; c < rvd->vdev_children; c++) {
10665 		vdev_t *vd = rvd->vdev_child[c];
10666 		vdev_indirect_state_sync_verify(vd);
10667 
10668 		if (vdev_indirect_should_condense(vd)) {
10669 			spa_condense_indirect_start_sync(vd, tx);
10670 			break;
10671 		}
10672 	}
10673 }
10674 
/*
 * Repeatedly sync out all dirty pool state for this txg.  Each pass may
 * dirty the MOS again (e.g. by allocating new blocks), so iterate until
 * a pass completes with the MOS clean.
 */
static void
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
{
	objset_t *mos = spa->spa_meta_objset;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	uint64_t txg = tx->tx_txg;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];

	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free ||
		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
			/*
			 * If the log space map feature is active we don't
			 * care about deferred frees and the deferred bpobj
			 * as the log space map should effectively have the
			 * same results (i.e. appending only to one object).
			 */
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We can not defer frees in pass 1, because
			 * we sync the deferred frees later in pass 1.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		brt_sync(spa, txg);
		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);
		dsl_errorscrub_sync(dp, tx);
		svr_sync(spa, tx);
		spa_sync_upgrades(spa, tx);

		spa_flush_metaslabs(spa, tx);

		/* Sync every top-level vdev queued as dirty for this txg. */
		vdev_t *vd = NULL;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL)
			vdev_sync(vd, txg);

		if (pass == 1) {
			/*
			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
			 * the config. If that happens, this txg should not
			 * be a no-op. So we must sync the config to the MOS
			 * before checking for no-op.
			 *
			 * Note that when the config is dirty, it will
			 * be written to the MOS (i.e. the MOS will be
			 * dirtied) every time we call spa_sync_config_object()
			 * in this txg.  Therefore we can't call this after
			 * dsl_pool_sync() every pass, because it would
			 * prevent us from converging, since we'd dirty
			 * the MOS every pass.
			 *
			 * Sync tasks can only be processed in pass 1, so
			 * there's no need to do this in later passes.
			 */
			spa_sync_config_object(spa, tx);
		}

		/*
		 * Note: We need to check if the MOS is dirty because we could
		 * have marked the MOS dirty without updating the uberblock
		 * (e.g. if we have sync tasks but no dirty user data). We need
		 * to check the uberblock's rootbp because it is updated if we
		 * have synced out dirty data (though in this case the MOS will
		 * most likely also be dirty due to second order effects, we
		 * don't want to rely on that here).
		 */
		if (pass == 1 &&
		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
		    !dmu_objset_is_dirty(mos, txg)) {
			/*
			 * Nothing changed on the first pass, therefore this
			 * TXG is a no-op. Avoid syncing deferred frees, so
			 * that we can keep this TXG as a no-op.
			 */
			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
			break;
		}

		spa_sync_deferred_frees(spa, tx);
	} while (dmu_objset_is_dirty(mos, txg));
}
10775 
10776 /*
10777  * Rewrite the vdev configuration (which includes the uberblock) to
10778  * commit the transaction group.
10779  *
10780  * If there are no dirty vdevs, we sync the uberblock to a few random
10781  * top-level vdevs that are known to be visible in the config cache
10782  * (see spa_vdev_add() for a complete description). If there *are* dirty
10783  * vdevs, sync the uberblock to all vdevs.
10784  */
static void
spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t txg = tx->tx_txg;

	/* Retry forever: the txg cannot commit until the labels are out. */
	for (;;) {
		int error = 0;

		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
			int svdcount = 0;
			int children = rvd->vdev_children;
			/* Start at a random child to spread label writes. */
			int c0 = random_in_range(children);

			for (int c = 0; c < children; c++) {
				vdev_t *vd =
				    rvd->vdev_child[(c0 + c) % children];

				/* Stop when revisiting the first vdev */
				if (c > 0 && svd[0] == vd)
					break;

				/*
				 * Skip vdevs with no metaslab array, log
				 * vdevs, and non-concrete vdevs.
				 */
				if (vd->vdev_ms_array == 0 ||
				    vd->vdev_islog ||
				    !vdev_is_concrete(vd))
					continue;

				svd[svdcount++] = vd;
				if (svdcount == SPA_SYNC_MIN_VDEVS)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/*
		 * The label writes failed: suspend the pool and retry the
		 * config sync after I/O is resumed.
		 */
		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
		zio_resume_wait(spa);
	}
}
10840 
10841 /*
10842  * Sync the specified transaction group.  New blocks may be dirtied as
10843  * part of the process, so we iterate until it converges.
10844  */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	vdev_t *vd = NULL;

	VERIFY(spa_writeable(spa));

	/*
	 * Wait for i/os issued in open context that need to complete
	 * before this txg syncs.
	 */
	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL);

	/*
	 * Now that there can be no more cloning in this transaction group,
	 * but we are still before issuing frees, we can process pending BRT
	 * updates.
	 */
	brt_pending_apply(spa, txg);

	spa_sync_time_logger(spa, txg, B_FALSE);

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
		/* Avoid holding the write lock unless actually necessary */
		if (vd->vdev_aux == NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
			continue;
		}
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	dsl_pool_t *dp = spa->spa_dsl_pool;
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = getlrtime();

	/*
	 * Arm the deadman: dispatch spa_deadman() after
	 * spa_deadman_synctime has elapsed, canceling any prior callback.
	 */
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
	    NSEC_TO_TICK(spa->spa_deadman_synctime));

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		vdev_t *rvd = spa->spa_root_vdev;

		/* Only set spa_deflate if every child passes the check. */
		int i;
		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY0(zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	spa_sync_adjust_vdev_max_queue_depth(spa);

	spa_sync_condense_indirect(spa, tx);

	spa_sync_iterate_to_convergence(spa, tx);

#ifdef ZFS_DEBUG
	if (!list_is_empty(&spa->spa_config_dirty_list)) {
	/*
	 * Make sure that the number of ZAPs for all the vdevs matches
	 * the number of ZAPs in the per-vdev ZAP list. This only gets
	 * called if the config is dirty; otherwise there may be
	 * outstanding AVZ operations that weren't completed in
	 * spa_sync_config_object.
	 */
		uint64_t all_vdev_zap_entry_count;
		ASSERT0(zap_count(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
		    all_vdev_zap_entry_count);
	}
#endif

	if (spa->spa_vdev_removal != NULL) {
		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
	}

	spa_sync_rewrite_vdev_config(spa, tx);
	dmu_tx_commit(tx);

	/* The txg is committed; disarm the deadman callback. */
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
	spa->spa_deadman_tqid = 0;

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
		vdev_sync_done(vd, txg);

	metaslab_class_evict_old(spa->spa_normal_class, txg);
	metaslab_class_evict_old(spa->spa_log_class, txg);
	/* Embedded log classes have only one metaslab per vdev. */
	metaslab_class_evict_old(spa->spa_special_class, txg);
	metaslab_class_evict_old(spa->spa_dedup_class, txg);

	spa_sync_close_syncing_log_sm(spa);

	spa_update_dspace(spa);

	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
		vdev_autotrim_kick(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	/* Debug hook: spin here while the pause tunable is set. */
	while (zfs_pause_spa_sync)
		delay(1);

	spa->spa_sync_pass = 0;

	/*
	 * Update the last synced uberblock here. We want to do this at
	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
	 * will be guaranteed that all the processing associated with
	 * that txg has been completed.
	 */
	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
11039 
11040 /*
11041  * Sync all pools.  We don't want to hold the namespace lock across these
11042  * operations, so we take a reference on the spa_t and drop the lock during the
11043  * sync.
11044  */
11045 void
11046 spa_sync_allpools(void)
11047 {
11048 	spa_t *spa = NULL;
11049 	spa_namespace_enter(FTAG);
11050 	while ((spa = spa_next(spa)) != NULL) {
11051 		if (spa_state(spa) != POOL_STATE_ACTIVE ||
11052 		    !spa_writeable(spa) || spa_suspended(spa))
11053 			continue;
11054 		spa_open_ref(spa, FTAG);
11055 		spa_namespace_exit(FTAG);
11056 		txg_wait_synced(spa_get_dsl(spa), 0);
11057 		spa_namespace_enter(FTAG);
11058 		spa_close(spa, FTAG);
11059 	}
11060 	spa_namespace_exit(FTAG);
11061 }
11062 
11063 taskq_t *
11064 spa_sync_tq_create(spa_t *spa, const char *name)
11065 {
11066 	kthread_t **kthreads;
11067 
11068 	ASSERT0P(spa->spa_sync_tq);
11069 	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
11070 
11071 	/*
11072 	 * - do not allow more allocators than cpus.
11073 	 * - there may be more cpus than allocators.
11074 	 * - do not allow more sync taskq threads than allocators or cpus.
11075 	 */
11076 	int nthreads = spa->spa_alloc_count;
11077 	spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
11078 	    nthreads, KM_SLEEP);
11079 
11080 	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
11081 	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
11082 	VERIFY(spa->spa_sync_tq != NULL);
11083 	VERIFY(kthreads != NULL);
11084 
11085 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
11086 	for (int i = 0; i < nthreads; i++, ti++) {
11087 		ti->sti_thread = kthreads[i];
11088 		ti->sti_allocator = i;
11089 	}
11090 
11091 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
11092 	return (spa->spa_sync_tq);
11093 }
11094 
/*
 * Tear down the sync taskq created by spa_sync_tq_create() and free the
 * per-thread bookkeeping.
 */
void
spa_sync_tq_destroy(spa_t *spa)
{
	ASSERT(spa->spa_sync_tq != NULL);

	/* Drain any outstanding tasks before destroying the taskq. */
	taskq_wait(spa->spa_sync_tq);
	taskq_destroy(spa->spa_sync_tq);
	kmem_free(spa->spa_syncthreads,
	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
	spa->spa_sync_tq = NULL;
}
11106 
11107 uint_t
11108 spa_acq_allocator(spa_t *spa)
11109 {
11110 	int i;
11111 
11112 	if (spa->spa_alloc_count == 1)
11113 		return (0);
11114 
11115 	mutex_enter(&spa->spa_allocs_use->sau_lock);
11116 	uint_t r = spa->spa_allocs_use->sau_rotor;
11117 	do {
11118 		if (++r == spa->spa_alloc_count)
11119 			r = 0;
11120 	} while (spa->spa_allocs_use->sau_inuse[r]);
11121 	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
11122 	spa->spa_allocs_use->sau_rotor = r;
11123 	mutex_exit(&spa->spa_allocs_use->sau_lock);
11124 
11125 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
11126 	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
11127 		if (ti->sti_thread == curthread) {
11128 			ti->sti_allocator = r;
11129 			break;
11130 		}
11131 	}
11132 	ASSERT3S(i, <, spa->spa_alloc_count);
11133 	return (r);
11134 }
11135 
11136 void
11137 spa_rel_allocator(spa_t *spa, uint_t allocator)
11138 {
11139 	if (spa->spa_alloc_count > 1)
11140 		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
11141 }
11142 
11143 void
11144 spa_select_allocator(zio_t *zio)
11145 {
11146 	zbookmark_phys_t *bm = &zio->io_bookmark;
11147 	spa_t *spa = zio->io_spa;
11148 
11149 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
11150 
11151 	/*
11152 	 * A gang block (for example) may have inherited its parent's
11153 	 * allocator, in which case there is nothing further to do here.
11154 	 */
11155 	if (ZIO_HAS_ALLOCATOR(zio))
11156 		return;
11157 
11158 	ASSERT(spa != NULL);
11159 	ASSERT(bm != NULL);
11160 
11161 	/*
11162 	 * First try to use an allocator assigned to the syncthread, and set
11163 	 * the corresponding write issue taskq for the allocator.
11164 	 * Note, we must have an open pool to do this.
11165 	 */
11166 	if (spa->spa_sync_tq != NULL) {
11167 		spa_syncthread_info_t *ti = spa->spa_syncthreads;
11168 		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
11169 			if (ti->sti_thread == curthread) {
11170 				zio->io_allocator = ti->sti_allocator;
11171 				return;
11172 			}
11173 		}
11174 	}
11175 
11176 	/*
11177 	 * We want to try to use as many allocators as possible to help improve
11178 	 * performance, but we also want logically adjacent IOs to be physically
11179 	 * adjacent to improve sequential read performance. We chunk each object
11180 	 * into 2^20 block regions, and then hash based on the objset, object,
11181 	 * level, and region to accomplish both of these goals.
11182 	 */
11183 	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
11184 	    bm->zb_blkid >> 20);
11185 
11186 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
11187 }
11188 
11189 /*
11190  * ==========================================================================
11191  * Miscellaneous routines
11192  * ==========================================================================
11193  */
11194 
11195 /*
11196  * Remove all pools in the system.
11197  */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	spa_namespace_enter(FTAG);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		spa_namespace_exit(FTAG);
		spa_async_suspend(spa);
		spa_namespace_enter(FTAG);
		spa_close(spa, FTAG);

		/* Unload and deactivate any pool that was ever activated. */
		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	spa_namespace_exit(FTAG);
}
11228 
11229 vdev_t *
11230 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
11231 {
11232 	vdev_t *vd;
11233 	int i;
11234 
11235 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
11236 		return (vd);
11237 
11238 	if (aux) {
11239 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
11240 			vd = spa->spa_l2cache.sav_vdevs[i];
11241 			if (vd->vdev_guid == guid)
11242 				return (vd);
11243 		}
11244 
11245 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
11246 			vd = spa->spa_spares.sav_vdevs[i];
11247 			if (vd->vdev_guid == guid)
11248 				return (vd);
11249 		}
11250 	}
11251 
11252 	return (NULL);
11253 }
11254 
/*
 * Raise the pool's on-disk SPA version to 'version' and wait for the
 * change to be synced out.
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Wait for the dirtied config (and new version) to hit disk. */
	txg_wait_synced(spa_get_dsl(spa), 0);
}
11277 
11278 static boolean_t
11279 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
11280 {
11281 	(void) spa;
11282 	int i;
11283 	uint64_t vdev_guid;
11284 
11285 	for (i = 0; i < sav->sav_count; i++)
11286 		if (sav->sav_vdevs[i]->vdev_guid == guid)
11287 			return (B_TRUE);
11288 
11289 	for (i = 0; i < sav->sav_npending; i++) {
11290 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
11291 		    &vdev_guid) == 0 && vdev_guid == guid)
11292 			return (B_TRUE);
11293 	}
11294 
11295 	return (B_FALSE);
11296 }
11297 
/* Check whether 'guid' is a configured or pending l2cache device. */
boolean_t
spa_has_l2cache(spa_t *spa, uint64_t guid)
{
	return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
}
11303 
/*
 * Return B_TRUE if the pool has (or is pending) a hot spare with the
 * given guid.
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
}
11309 
/*
 * Check if a pool has an active shared spare device.
 * Note: reference count of an active spare is 2, as a spare and as a
 * replacement.
 */
11314 static boolean_t
11315 spa_has_active_shared_spare(spa_t *spa)
11316 {
11317 	int i, refcnt;
11318 	uint64_t pool;
11319 	spa_aux_vdev_t *sav = &spa->spa_spares;
11320 
11321 	for (i = 0; i < sav->sav_count; i++) {
11322 		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
11323 		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
11324 		    refcnt > 2)
11325 			return (B_TRUE);
11326 	}
11327 
11328 	return (B_FALSE);
11329 }
11330 
11331 uint64_t
11332 spa_total_metaslabs(spa_t *spa)
11333 {
11334 	vdev_t *rvd = spa->spa_root_vdev;
11335 
11336 	uint64_t m = 0;
11337 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
11338 		vdev_t *vd = rvd->vdev_child[c];
11339 		if (!vdev_is_concrete(vd))
11340 			continue;
11341 		m += vd->vdev_ms_count;
11342 	}
11343 	return (m);
11344 }
11345 
11346 /*
11347  * Notify any waiting threads that some activity has switched from being in-
11348  * progress to not-in-progress so that the thread can wake up and determine
11349  * whether it is finished waiting.
11350  */
void
spa_notify_waiters(spa_t *spa)
{
	/*
	 * Acquiring spa_activities_lock here prevents the cv_broadcast from
	 * happening between the waiting thread's check and cv_wait.  Without
	 * the lock a waiter could observe the activity as in-progress, miss
	 * this broadcast, and then block indefinitely.
	 */
	mutex_enter(&spa->spa_activities_lock);
	cv_broadcast(&spa->spa_activities_cv);
	mutex_exit(&spa->spa_activities_lock);
}
11362 
11363 /*
11364  * Notify any waiting threads that the pool is exporting, and then block until
11365  * they are finished using the spa_t.
11366  */
void
spa_wake_waiters(spa_t *spa)
{
	mutex_enter(&spa->spa_activities_lock);
	/* Tell every waiter in spa_wait_common to stop waiting. */
	spa->spa_waiters_cancel = B_TRUE;
	cv_broadcast(&spa->spa_activities_cv);
	/* Block until the last waiter has released its hold on the spa_t. */
	while (spa->spa_waiters != 0)
		cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
	spa->spa_waiters_cancel = B_FALSE;
	mutex_exit(&spa->spa_activities_lock);
}
11378 
11379 /* Whether the vdev or any of its descendants are being initialized/trimmed. */
static boolean_t
spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
	    activity == ZPOOL_WAIT_TRIM);

	/* Each activity's state is protected by its own per-vdev lock. */
	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;

	/*
	 * Drop spa_activities_lock before taking the per-vdev lock to
	 * preserve lock ordering (see the "Locking for waiting threads"
	 * comment below), then reacquire it.
	 */
	mutex_exit(&spa->spa_activities_lock);
	mutex_enter(lock);
	mutex_enter(&spa->spa_activities_lock);

	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
	mutex_exit(lock);

	if (in_progress)
		return (B_TRUE);

	/* Recurse: an interior vdev is busy if any descendant leaf is. */
	for (int i = 0; i < vd->vdev_children; i++) {
		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
		    activity))
			return (B_TRUE);
	}

	return (B_FALSE);
}
11413 
11414 /*
11415  * If use_guid is true, this checks whether the vdev specified by guid is
11416  * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
11417  * is being initialized/trimmed. The caller must hold the config lock and
11418  * spa_activities_lock.
11419  */
static int
spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
    zpool_wait_activity_t activity, boolean_t *in_progress)
{
	/*
	 * Drop spa_activities_lock while grabbing the config lock to keep
	 * the lock order consistent with threads that hold the config lock
	 * when calling spa_notify_waiters.
	 */
	mutex_exit(&spa->spa_activities_lock);
	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	mutex_enter(&spa->spa_activities_lock);

	vdev_t *vd;
	if (use_guid) {
		/* Only leaf vdevs can be initialized or trimmed. */
		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
			return (EINVAL);
		}
	} else {
		/* No guid given: scan the whole tree from the root. */
		vd = spa->spa_root_vdev;
	}

	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);

	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
	return (0);
}
11444 
11445 /*
11446  * Locking for waiting threads
11447  * ---------------------------
11448  *
11449  * Waiting threads need a way to check whether a given activity is in progress,
11450  * and then, if it is, wait for it to complete. Each activity will have some
11451  * in-memory representation of the relevant on-disk state which can be used to
11452  * determine whether or not the activity is in progress. The in-memory state and
11453  * the locking used to protect it will be different for each activity, and may
11454  * not be suitable for use with a cvar (e.g., some state is protected by the
11455  * config lock). To allow waiting threads to wait without any races, another
11456  * lock, spa_activities_lock, is used.
11457  *
11458  * When the state is checked, both the activity-specific lock (if there is one)
11459  * and spa_activities_lock are held. In some cases, the activity-specific lock
11460  * is acquired explicitly (e.g. the config lock). In others, the locking is
11461  * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
11462  * thread releases the activity-specific lock and, if the activity is in
11463  * progress, then cv_waits using spa_activities_lock.
11464  *
11465  * The waiting thread is woken when another thread, one completing some
11466  * activity, updates the state of the activity and then calls
11467  * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
11468  * needs to hold its activity-specific lock when updating the state, and this
11469  * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
11470  *
11471  * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
11472  * and because it is held when the waiting thread checks the state of the
11473  * activity, it can never be the case that the completing thread both updates
11474  * the activity state and cv_broadcasts in between the waiting thread's check
11475  * and cv_wait. Thus, a waiting thread can never miss a wakeup.
11476  *
11477  * In order to prevent deadlock, when the waiting thread does its check, in some
11478  * cases it will temporarily drop spa_activities_lock in order to acquire the
11479  * activity-specific lock. The order in which spa_activities_lock and the
11480  * activity specific lock are acquired in the waiting thread is determined by
11481  * the order in which they are acquired in the completing thread; if the
11482  * completing thread calls spa_notify_waiters with the activity-specific lock
11483  * held, then the waiting thread must also acquire the activity-specific lock
11484  * first.
11485  */
11486 
/*
 * Determine whether the given activity is currently in progress on the
 * pool, storing the result in *in_progress.  Returns nonzero on error
 * (currently only from the initialize/trim vdev lookup).  Caller must
 * hold spa_activities_lock; some cases drop and reacquire it.
 */
static int
spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));

	switch (activity) {
	case ZPOOL_WAIT_CKPT_DISCARD:
		/*
		 * A checkpoint discard is underway when the feature is still
		 * active but the checkpoint ZAP entry has already been
		 * removed from the pool directory.
		 */
		*in_progress =
		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
		    zap_contains(spa_meta_objset(spa),
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
		    ENOENT);
		break;
	case ZPOOL_WAIT_FREE:
		/*
		 * Frees are pending if the free bpobj is non-empty, an async
		 * destroy is active, or a clone livelist deletion is queued.
		 */
		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
		    spa_livelist_delete_check(spa));
		break;
	case ZPOOL_WAIT_INITIALIZE:
	case ZPOOL_WAIT_TRIM:
		/* Per-vdev activities; may be restricted to 'tag' (a guid). */
		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
		    activity, in_progress);
		break;
	case ZPOOL_WAIT_REPLACE:
		/* Drop/reacquire to take the config lock in the right order. */
		mutex_exit(&spa->spa_activities_lock);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
		mutex_enter(&spa->spa_activities_lock);

		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		break;
	case ZPOOL_WAIT_REMOVE:
		*in_progress = (spa->spa_removing_phys.sr_state ==
		    DSS_SCANNING);
		break;
	case ZPOOL_WAIT_RESILVER:
		/* Sequential (rebuild-style) resilvers are checked first. */
		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
		if (*in_progress)
			break;
		/* Otherwise a resilver is a scan, handled like a scrub. */
		zfs_fallthrough;
	case ZPOOL_WAIT_SCRUB:
	{
		boolean_t scanning, paused, is_scrub;
		dsl_scan_t *scn =  spa->spa_dsl_pool->dp_scan;

		/*
		 * A scan counts only if it is actually running (not paused)
		 * and its type matches the activity being waited on.
		 */
		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
		paused = dsl_scan_is_paused_scrub(scn);
		*in_progress = (scanning && !paused &&
		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
		break;
	}
	case ZPOOL_WAIT_RAIDZ_EXPAND:
	{
		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
		break;
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}
11555 
/*
 * Common implementation for spa_wait and spa_wait_tag: block the caller
 * until the given activity is no longer in progress (or the wait is
 * canceled by pool export, or interrupted by a signal).  *waited is set
 * to B_TRUE iff we actually slept at least once.
 */
static int
spa_wait_common(const char *pool, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *waited)
{
	/*
	 * The tag is used to distinguish between instances of an activity.
	 * 'initialize' and 'trim' are the only activities that we use this for.
	 * The other activities can only have a single instance in progress in a
	 * pool at one time, making the tag unnecessary.
	 *
	 * There can be multiple devices being replaced at once, but since they
	 * all finish once resilvering finishes, we don't bother keeping track
	 * of them individually, we just wait for them all to finish.
	 */
	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
	    activity != ZPOOL_WAIT_TRIM)
		return (EINVAL);

	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
		return (EINVAL);

	spa_t *spa;
	int error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	/*
	 * Increment the spa's waiter count so that we can call spa_close and
	 * still ensure that the spa_t doesn't get freed before this thread is
	 * finished with it when the pool is exported. We want to call spa_close
	 * before we start waiting because otherwise the additional ref would
	 * prevent the pool from being exported or destroyed throughout the
	 * potentially long wait.
	 */
	mutex_enter(&spa->spa_activities_lock);
	spa->spa_waiters++;
	spa_close(spa, FTAG);

	*waited = B_FALSE;
	for (;;) {
		boolean_t in_progress;
		/* May drop and reacquire spa_activities_lock internally. */
		error = spa_activity_in_progress(spa, activity, use_tag, tag,
		    &in_progress);

		/* spa_waiters_cancel is set by spa_wake_waiters on export. */
		if (error || !in_progress || spa->spa_waiters_cancel)
			break;

		*waited = B_TRUE;

		/* cv_wait_sig returns 0 when interrupted by a signal. */
		if (cv_wait_sig(&spa->spa_activities_cv,
		    &spa->spa_activities_lock) == 0) {
			error = EINTR;
			break;
		}
	}

	/* Let spa_wake_waiters know we are done with the spa_t. */
	spa->spa_waiters--;
	cv_signal(&spa->spa_waiters_cv);
	mutex_exit(&spa->spa_activities_lock);

	return (error);
}
11618 
11619 /*
11620  * Wait for a particular instance of the specified activity to complete, where
11621  * the instance is identified by 'tag'
11622  */
int
spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
    boolean_t *waited)
{
	/* use_tag = B_TRUE: wait only for the instance identified by 'tag'. */
	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
}
11629 
/*
 * Wait for all instances of the specified activity to complete
 */
11633 int
11634 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
11635 {
11636 
11637 	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
11638 }
11639 
/*
 * Allocate a sysevent for the named event, with its payload built from
 * the spa and (optionally) the vdev and history nvlist.  Returns NULL if
 * the payload could not be created, and always in userland builds.
 * The caller is responsible for posting (and thereby freeing) the event
 * via spa_event_post.
 */
sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	nvlist_t *resource;

	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
	if (resource) {
		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
		ev->resource = resource;
	}
#else
	(void) spa, (void) vd, (void) hist_nvl, (void) name;
#endif
	return (ev);
}
11657 
/*
 * Post a sysevent created by spa_event_create and free it.  A NULL 'ev'
 * is a no-op; in userland builds the event is simply discarded.
 */
void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	if (ev) {
		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
		kmem_free(ev, sizeof (*ev));
	}
#else
	(void) ev;
#endif
}
11670 
11671 /*
11672  * Post a zevent corresponding to the given sysevent.   The 'name' must be one
11673  * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
11674  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
11675  * in the userland libzpool, as we don't want consumers to misinterpret ztest
11676  * or zdb as real changes.
11677  */
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	/* Convenience wrapper: create the event and post it immediately. */
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}
11683 
11684 /* state manipulation functions */
11685 EXPORT_SYMBOL(spa_open);
11686 EXPORT_SYMBOL(spa_open_rewind);
11687 EXPORT_SYMBOL(spa_get_stats);
11688 EXPORT_SYMBOL(spa_create);
11689 EXPORT_SYMBOL(spa_import);
11690 EXPORT_SYMBOL(spa_tryimport);
11691 EXPORT_SYMBOL(spa_destroy);
11692 EXPORT_SYMBOL(spa_export);
11693 EXPORT_SYMBOL(spa_reset);
11694 EXPORT_SYMBOL(spa_async_request);
11695 EXPORT_SYMBOL(spa_async_suspend);
11696 EXPORT_SYMBOL(spa_async_resume);
11697 EXPORT_SYMBOL(spa_inject_addref);
11698 EXPORT_SYMBOL(spa_inject_delref);
11699 EXPORT_SYMBOL(spa_scan_stat_init);
11700 EXPORT_SYMBOL(spa_scan_get_stats);
11701 
11702 /* device manipulation */
11703 EXPORT_SYMBOL(spa_vdev_add);
11704 EXPORT_SYMBOL(spa_vdev_attach);
11705 EXPORT_SYMBOL(spa_vdev_detach);
11706 EXPORT_SYMBOL(spa_vdev_setpath);
11707 EXPORT_SYMBOL(spa_vdev_setfru);
11708 EXPORT_SYMBOL(spa_vdev_split_mirror);
11709 
/* spare state (which is global across all pools) */
11711 EXPORT_SYMBOL(spa_spare_add);
11712 EXPORT_SYMBOL(spa_spare_remove);
11713 EXPORT_SYMBOL(spa_spare_exists);
11714 EXPORT_SYMBOL(spa_spare_activate);
11715 
/* L2ARC state (which is global across all pools) */
11717 EXPORT_SYMBOL(spa_l2cache_add);
11718 EXPORT_SYMBOL(spa_l2cache_remove);
11719 EXPORT_SYMBOL(spa_l2cache_exists);
11720 EXPORT_SYMBOL(spa_l2cache_activate);
11721 EXPORT_SYMBOL(spa_l2cache_drop);
11722 
11723 /* scanning */
11724 EXPORT_SYMBOL(spa_scan);
11725 EXPORT_SYMBOL(spa_scan_range);
11726 EXPORT_SYMBOL(spa_scan_stop);
11727 
11728 /* spa syncing */
11729 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
11730 EXPORT_SYMBOL(spa_sync_allpools);
11731 
11732 /* properties */
11733 EXPORT_SYMBOL(spa_prop_set);
11734 EXPORT_SYMBOL(spa_prop_get);
11735 EXPORT_SYMBOL(spa_prop_clear_bootfs);
11736 
11737 /* asynchronous event notification */
11738 EXPORT_SYMBOL(spa_event_notify);
11739 
11740 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
11741 	"Percentage of CPUs to run a metaslab preload taskq");
11742 
11743 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
11744 	"log2 fraction of arc that can be used by inflight I/Os when "
11745 	"verifying pool during import");
11746 
11747 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
11748 	"Set to traverse metadata on pool import");
11749 
11750 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
11751 	"Set to traverse data on pool import");
11752 
11753 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
11754 	"Print vdev tree to zfs_dbgmsg during pool import");
11755 
11756 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
11757 	"Percentage of CPUs to run an IO worker thread");
11758 
11759 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
11760 	"Number of threads per IO worker taskqueue");
11761 
11762 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
11763 	"Allow importing pool with up to this number of missing top-level "
11764 	"vdevs (in read-only mode)");
11765 
11766 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
11767 	ZMOD_RW, "Set the livelist condense zthr to pause");
11768 
11769 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
11770 	ZMOD_RW, "Set the livelist condense synctask to pause");
11771 
11772 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
11773 	INT, ZMOD_RW,
11774 	"Whether livelist condensing was canceled in the synctask");
11775 
11776 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
11777 	INT, ZMOD_RW,
11778 	"Whether livelist condensing was canceled in the zthr function");
11779 
11780 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
11781 	ZMOD_RW,
11782 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
11783 	"was being condensed");
11784 
11785 ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
11786 	"How frequently TXG timestamps are stored internally (in seconds)");
11787 
11788 ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
11789 	"How frequently the TXG timestamps database should be flushed "
11790 	"to disk (in seconds)");
11791 
11792 #ifdef _KERNEL
11793 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
11794 	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
11795 	"Configure IO queues for read IO");
11796 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
11797 	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
11798 	"Configure IO queues for write IO");
11799 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_free,
11800 	spa_taskq_free_param_set, spa_taskq_free_param_get, ZMOD_RW,
11801 	"Configure IO queues for free IO");
11802 #endif
11803 
11804 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
11805 	"Number of CPUs per write issue taskq");
11806