1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2018, Joyent, Inc.
25 * Copyright (c) 2011, 2020, Delphix. All rights reserved.
26 * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
27 * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
28 * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
29 * Copyright (c) 2020, George Amanakis. All rights reserved.
30 * Copyright (c) 2019, 2024, 2025, Klara, Inc.
31 * Copyright (c) 2019, Allan Jude
32 * Copyright (c) 2020, The FreeBSD Foundation [1]
33 * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
34 *
35 * [1] Portions of this software were developed by Allan Jude
36 * under sponsorship from the FreeBSD Foundation.
37 */
38
39 /*
40 * DVA-based Adjustable Replacement Cache
41 *
42 * While much of the theory of operation used here is
43 * based on the self-tuning, low overhead replacement cache
44 * presented by Megiddo and Modha at FAST 2003, there are some
45 * significant differences:
46 *
47 * 1. The Megiddo and Modha model assumes any page is evictable.
48 * Pages in its cache cannot be "locked" into memory. This makes
49 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
51 * about. Our cache is not so simple. At any given moment, some
52 * subset of the blocks in the cache are un-evictable because we
53 * have handed out a reference to them. Blocks are only evictable
54 * when there are no external references active. This makes
55 * eviction far more problematic: we choose to evict the evictable
56 * blocks that are the "lowest" in the list.
57 *
58 * There are times when it is not possible to evict the requested
59 * space. In these circumstances we are unable to adjust the cache
60 * size. To prevent the cache growing unbounded at these times we
61 * implement a "cache throttle" that slows the flow of new data
62 * into the cache until we can make space available.
63 *
64 * 2. The Megiddo and Modha model assumes a fixed cache size.
65 * Pages are evicted when the cache is full and there is a cache
66 * miss. Our model has a variable sized cache. It grows with
67 * high use, but also tries to react to memory pressure from the
68 * operating system: decreasing its size when system memory is
69 * tight.
70 *
71 * 3. The Megiddo and Modha model assumes a fixed page size. All
72 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
74 * a matter of choosing a single page to evict. In our model, we
75 * have variable sized cache blocks (ranging from 512 bytes to
76 * 128K bytes). We therefore choose a set of blocks to evict to make
77 * space for a cache miss that approximates as closely as possible
78 * the space used by the new block.
79 *
80 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
81 * by N. Megiddo & D. Modha, FAST 2003
82 */
83
84 /*
85 * The locking model:
86 *
87 * A new reference to a cache buffer can be obtained in two
88 * ways: 1) via a hash table lookup using the DVA as a key,
89 * or 2) via one of the ARC lists. The arc_read() interface
90 * uses method 1, while the internal ARC algorithms for
91 * adjusting the cache use method 2. We therefore provide two
92 * types of locks: 1) the hash table lock array, and 2) the
93 * ARC list locks.
94 *
95 * Buffers do not have their own mutexes, rather they rely on the
96 * hash table mutexes for the bulk of their protection (i.e. most
97 * fields in the arc_buf_hdr_t are protected by these mutexes).
98 *
99 * buf_hash_find() returns the appropriate mutex (held) when it
100 * locates the requested buffer in the hash table. It returns
101 * NULL for the mutex if the buffer was not in the table.
102 *
103 * buf_hash_remove() expects the appropriate hash mutex to be
104 * already held before it is invoked.
105 *
106 * Each ARC state also has a mutex which is used to protect the
107 * buffer list associated with the state. When attempting to
108 * obtain a hash table lock while holding an ARC list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
110 * the active state mutex must be held before the ghost state mutex.
111 *
 * It is also possible to register a callback which is run when the
113 * metadata limit is reached and no buffers can be safely evicted. In
114 * this case the arc user should drop a reference on some arc buffers so
115 * they can be reclaimed. For example, when using the ZPL each dentry
 * holds a reference on a znode.  These dentries must be pruned before
117 * the arc buffer holding the znode can be safely evicted.
118 *
119 * Note that the majority of the performance stats are manipulated
120 * with atomic operations.
121 *
122 * The L2ARC uses the l2ad_mtx on each vdev for the following:
123 *
124 * - L2ARC buflist creation
125 * - L2ARC buflist eviction
126 * - L2ARC write completion, which walks L2ARC buflists
127 * - ARC header destruction, as it removes from L2ARC buflists
128 * - ARC header release, as it removes from L2ARC buflists
129 */
130
131 /*
132 * ARC operation:
133 *
134 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
135 * This structure can point either to a block that is still in the cache or to
136 * one that is only accessible in an L2 ARC device, or it can provide
137 * information about a block that was recently evicted. If a block is
138 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
139 * information to retrieve it from the L2ARC device. This information is
140 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
141 * that is in this state cannot access the data directly.
142 *
143 * Blocks that are actively being referenced or have not been evicted
144 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
145 * the arc_buf_hdr_t that will point to the data block in memory. A block can
146 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
147 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
148 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
149 *
150 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
151 * ability to store the physical data (b_pabd) associated with the DVA of the
152 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
153 * it will match its on-disk compression characteristics. This behavior can be
154 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
155 * compressed ARC functionality is disabled, the b_pabd will point to an
156 * uncompressed version of the on-disk data.
157 *
158 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
159 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
160 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
161 * consumer. The ARC will provide references to this data and will keep it
162 * cached until it is no longer in use. The ARC caches only the L1ARC's physical
163 * data block and will evict any arc_buf_t that is no longer referenced. The
164 * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
165 * "overhead_size" kstat.
166 *
167 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
168 * compressed form. The typical case is that consumers will want uncompressed
169 * data, and when that happens a new data buffer is allocated where the data is
170 * decompressed for them to use. Currently the only consumer who wants
171 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
172 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
173 * with the arc_buf_hdr_t.
174 *
175 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
176 * first one is owned by a compressed send consumer (and therefore references
177 * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
178 * used by any other consumer (and has its own uncompressed copy of the data
179 * buffer).
180 *
181 * arc_buf_hdr_t
182 * +-----------+
183 * | fields |
184 * | common to |
185 * | L1- and |
186 * | L2ARC |
187 * +-----------+
188 * | l2arc_buf_hdr_t
189 * | |
190 * +-----------+
191 * | l1arc_buf_hdr_t
192 * | | arc_buf_t
193 * | b_buf +------------>+-----------+ arc_buf_t
194 * | b_pabd +-+ |b_next +---->+-----------+
195 * +-----------+ | |-----------| |b_next +-->NULL
196 * | |b_comp = T | +-----------+
197 * | |b_data +-+ |b_comp = F |
198 * | +-----------+ | |b_data +-+
199 * +->+------+ | +-----------+ |
200 * compressed | | | |
201 * data | |<--------------+ | uncompressed
202 * +------+ compressed, | data
203 * shared +-->+------+
204 * data | |
205 * | |
206 * +------+
207 *
208 * When a consumer reads a block, the ARC must first look to see if the
209 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
210 * arc_buf_t and either copies uncompressed data into a new data buffer from an
211 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
212 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
213 * hdr is compressed and the desired compression characteristics of the
214 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
215 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
216 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
217 * be anywhere in the hdr's list.
218 *
219 * The diagram below shows an example of an uncompressed ARC hdr that is
220 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
221 * the last element in the buf list):
222 *
223 * arc_buf_hdr_t
224 * +-----------+
225 * | |
226 * | |
227 * | |
228 * +-----------+
229 * l2arc_buf_hdr_t| |
230 * | |
231 * +-----------+
232 * l1arc_buf_hdr_t| |
233 * | | arc_buf_t (shared)
234 * | b_buf +------------>+---------+ arc_buf_t
235 * | | |b_next +---->+---------+
236 * | b_pabd +-+ |---------| |b_next +-->NULL
237 * +-----------+ | | | +---------+
238 * | |b_data +-+ | |
239 * | +---------+ | |b_data +-+
240 * +->+------+ | +---------+ |
241 * | | | |
242 * uncompressed | | | |
243 * data +------+ | |
244 * ^ +->+------+ |
245 * | uncompressed | | |
246 * | data | | |
247 * | +------+ |
248 * +---------------------------------+
249 *
250 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
251 * since the physical block is about to be rewritten. The new data contents
252 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
253 * it may compress the data before writing it to disk. The ARC will be called
254 * with the transformed data and will memcpy the transformed on-disk block into
255 * a newly allocated b_pabd. Writes are always done into buffers which have
256 * either been loaned (and hence are new and don't have other readers) or
257 * buffers which have been released (and hence have their own hdr, if there
258 * were originally other readers of the buf's original hdr). This ensures that
259 * the ARC only needs to update a single buf and its hdr after a write occurs.
260 *
261 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
262 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
263 * that when compressed ARC is enabled that the L2ARC blocks are identical
264 * to the on-disk block in the main data pool. This provides a significant
265 * advantage since the ARC can leverage the bp's checksum when reading from the
266 * L2ARC to determine if the contents are valid. However, if the compressed
267 * ARC is disabled, then the L2ARC's block must be transformed to look
268 * like the physical block in the main data pool before comparing the
269 * checksum and determining its validity.
270 *
271 * The L1ARC has a slightly different system for storing encrypted data.
272 * Raw (encrypted + possibly compressed) data has a few subtle differences from
273 * data that is just compressed. The biggest difference is that it is not
274 * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
275 * The other difference is that encryption cannot be treated as a suggestion.
276 * If a caller would prefer compressed data, but they actually wind up with
277 * uncompressed data the worst thing that could happen is there might be a
278 * performance hit. If the caller requests encrypted data, however, we must be
279 * sure they actually get it or else secret information could be leaked. Raw
280 * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
281 * may have both an encrypted version and a decrypted version of its data at
282 * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
283 * copied out of this header. To avoid complications with b_pabd, raw buffers
284 * cannot be shared.
285 */
286
287 #include <sys/spa.h>
288 #include <sys/zio.h>
289 #include <sys/spa_impl.h>
290 #include <sys/zio_compress.h>
291 #include <sys/zio_checksum.h>
292 #include <sys/zfs_context.h>
293 #include <sys/arc.h>
294 #include <sys/zfs_refcount.h>
295 #include <sys/vdev.h>
296 #include <sys/vdev_impl.h>
297 #include <sys/dsl_pool.h>
298 #include <sys/multilist.h>
299 #include <sys/abd.h>
300 #include <sys/dbuf.h>
301 #include <sys/zil.h>
302 #include <sys/fm/fs/zfs.h>
303 #include <sys/callb.h>
304 #include <sys/kstat.h>
305 #include <sys/zthr.h>
306 #include <zfs_fletcher.h>
307 #include <sys/arc_impl.h>
308 #include <sys/trace_zfs.h>
309 #include <sys/aggsum.h>
310 #include <sys/wmsum.h>
311 #include <cityhash.h>
312 #include <sys/vdev_trim.h>
313 #include <sys/zfs_racct.h>
314 #include <sys/zstd/zstd.h>
315
316 #ifndef _KERNEL
317 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
318 boolean_t arc_watch = B_FALSE;
319 #endif
320
321 /*
322 * This thread's job is to keep enough free memory in the system, by
323 * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
324 * arc_available_memory().
325 */
326 static zthr_t *arc_reap_zthr;
327
328 /*
329 * This thread's job is to keep arc_size under arc_c, by calling
330 * arc_evict(), which improves arc_is_overflowing().
331 */
332 static zthr_t *arc_evict_zthr;
333 static arc_buf_hdr_t **arc_state_evict_markers;
334 static int arc_state_evict_marker_count;
335
336 static kmutex_t arc_evict_lock;
337 static boolean_t arc_evict_needed = B_FALSE;
338 static clock_t arc_last_uncached_flush;
339
340 static taskq_t *arc_evict_taskq;
341 static struct evict_arg *arc_evict_arg;
342
343 /*
344 * Count of bytes evicted since boot.
345 */
346 static uint64_t arc_evict_count;
347
348 /*
349 * List of arc_evict_waiter_t's, representing threads waiting for the
350 * arc_evict_count to reach specific values.
351 */
352 static list_t arc_evict_waiters;
353
354 /*
355 * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
356 * the requested amount of data to be evicted. For example, by default for
357 * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
358 * Since this is above 100%, it ensures that progress is made towards getting
359 * arc_size under arc_c. Since this is finite, it ensures that allocations
360 * can still happen, even during the potentially long time that arc_size is
361 * more than arc_c.
362 */
363 static uint_t zfs_arc_eviction_pct = 200;
364
365 /*
366 * The number of headers to evict in arc_evict_state_impl() before
367 * dropping the sublist lock and evicting from another sublist. A lower
368 * value means we're more likely to evict the "correct" header (i.e. the
369 * oldest header in the arc state), but comes with higher overhead
370 * (i.e. more invocations of arc_evict_state_impl()).
371 */
372 static uint_t zfs_arc_evict_batch_limit = 10;
373
374 /*
 * Number of batches to process per parallel eviction task under heavy load to
376 * reduce number of context switches.
377 */
378 static uint_t zfs_arc_evict_batches_limit = 5;
379
380 /* number of seconds before growing cache again */
381 uint_t arc_grow_retry = 5;
382
383 /*
384 * Minimum time between calls to arc_kmem_reap_soon().
385 */
386 static const int arc_kmem_cache_reap_retry_ms = 1000;
387
388 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
389 static int zfs_arc_overflow_shift = 8;
390
391 /* log2(fraction of arc to reclaim) */
392 uint_t arc_shrink_shift = 7;
393
394 #ifdef _KERNEL
395 /* percent of pagecache to reclaim arc to */
396 uint_t zfs_arc_pc_percent = 0;
397 #endif
398
399 /*
400 * log2(fraction of ARC which must be free to allow growing).
401 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
402 * when reading a new block into the ARC, we will evict an equal-sized block
403 * from the ARC.
404 *
405 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
406 * we will still not allow it to grow.
407 */
408 uint_t arc_no_grow_shift = 5;
409
410
411 /*
412 * minimum lifespan of a prefetch block in clock ticks
413 * (initialized in arc_init())
414 */
415 static uint_t arc_min_prefetch;
416 static uint_t arc_min_prescient_prefetch;
417
418 /*
419 * If this percent of memory is free, don't throttle.
420 */
421 uint_t arc_lotsfree_percent = 10;
422
423 /*
424 * The arc has filled available memory and has now warmed up.
425 */
426 boolean_t arc_warm;
427
428 /*
429 * These tunables are for performance analysis.
430 */
431 uint64_t zfs_arc_max = 0;
432 uint64_t zfs_arc_min = 0;
433 static uint64_t zfs_arc_dnode_limit = 0;
434 static uint_t zfs_arc_dnode_reduce_percent = 10;
435 static uint_t zfs_arc_grow_retry = 0;
436 static uint_t zfs_arc_shrink_shift = 0;
437 uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
438
439 /*
440 * ARC dirty data constraints for arc_tempreserve_space() throttle:
441 * * total dirty data limit
442 * * anon block dirty limit
443 * * each pool's anon allowance
444 */
445 static const unsigned long zfs_arc_dirty_limit_percent = 50;
446 static const unsigned long zfs_arc_anon_limit_percent = 25;
447 static const unsigned long zfs_arc_pool_dirty_percent = 20;
448
449 /*
450 * Enable or disable compressed arc buffers.
451 */
452 int zfs_compressed_arc_enabled = B_TRUE;
453
454 /*
455 * Balance between metadata and data on ghost hits. Values above 100
456 * increase metadata caching by proportionally reducing effect of ghost
457 * data hits on target data/metadata rate.
458 */
459 static uint_t zfs_arc_meta_balance = 500;
460
461 /*
462 * Percentage that can be consumed by dnodes of ARC meta buffers.
463 */
464 static uint_t zfs_arc_dnode_limit_percent = 10;
465
466 /*
467 * These tunables are Linux-specific
468 */
469 static uint64_t zfs_arc_sys_free = 0;
470 static uint_t zfs_arc_min_prefetch_ms = 0;
471 static uint_t zfs_arc_min_prescient_prefetch_ms = 0;
472 static uint_t zfs_arc_lotsfree_percent = 10;
473
474 /*
475 * Number of arc_prune threads
476 */
477 static int zfs_arc_prune_task_threads = 1;
478
479 /* Used by spa_export/spa_destroy to flush the arc asynchronously */
480 static taskq_t *arc_flush_taskq;
481
482 /*
483 * Controls the number of ARC eviction threads to dispatch sublists to.
484 *
485 * Possible values:
486 * 0 (auto) compute the number of threads using a logarithmic formula.
487 * 1 (disabled) one thread - parallel eviction is disabled.
488 * 2+ (manual) set the number manually.
489 *
490 * See arc_evict_thread_init() for how "auto" is computed.
491 */
492 static uint_t zfs_arc_evict_threads = 0;
493
494 /* The 7 states: */
495 arc_state_t ARC_anon;
496 arc_state_t ARC_mru;
497 arc_state_t ARC_mru_ghost;
498 arc_state_t ARC_mfu;
499 arc_state_t ARC_mfu_ghost;
500 arc_state_t ARC_l2c_only;
501 arc_state_t ARC_uncached;
502
/*
 * Template for the "arcstat" kstat: one named entry per statistic.
 * This is a positional initializer of arc_stats_t, so the order of the
 * entries here must match the field order of that structure exactly.
 * Values are updated via the ARCSTAT* macros defined below.
 */
arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "iohits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_iohits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_iohits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_iohits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_iohits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "uncached_hits", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "access_skip", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_not_enough", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_skip", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "meta", KSTAT_DATA_UINT64 },
	{ "pd", KSTAT_DATA_UINT64 },
	{ "pm", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "compressed_size", KSTAT_DATA_UINT64 },
	{ "uncompressed_size", KSTAT_DATA_UINT64 },
	{ "overhead_size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "dbuf_size", KSTAT_DATA_UINT64 },
	{ "dnode_size", KSTAT_DATA_UINT64 },
	{ "bonus_size", KSTAT_DATA_UINT64 },
#if defined(COMPAT_FREEBSD11)
	/* Only exported for FreeBSD 11 compatibility. */
	{ "other_size", KSTAT_DATA_UINT64 },
#endif
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_data", KSTAT_DATA_UINT64 },
	{ "anon_metadata", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_data", KSTAT_DATA_UINT64 },
	{ "mru_metadata", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_data", KSTAT_DATA_UINT64 },
	{ "mfu_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "uncached_size", KSTAT_DATA_UINT64 },
	{ "uncached_data", KSTAT_DATA_UINT64 },
	{ "uncached_metadata", KSTAT_DATA_UINT64 },
	{ "uncached_evictable_data", KSTAT_DATA_UINT64 },
	{ "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_prefetch_asize", KSTAT_DATA_UINT64 },
	{ "l2_mru_asize", KSTAT_DATA_UINT64 },
	{ "l2_mfu_asize", KSTAT_DATA_UINT64 },
	{ "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
	{ "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "l2_log_blk_writes", KSTAT_DATA_UINT64 },
	{ "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
	{ "l2_log_blk_asize", KSTAT_DATA_UINT64 },
	{ "l2_log_blk_count", KSTAT_DATA_UINT64 },
	{ "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_success", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_size", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_asize", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
	{ "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "memory_direct_count", KSTAT_DATA_UINT64 },
	{ "memory_indirect_count", KSTAT_DATA_UINT64 },
	{ "memory_all_bytes", KSTAT_DATA_UINT64 },
	{ "memory_free_bytes", KSTAT_DATA_UINT64 },
	/* Signed: can go negative when memory is critically short. */
	{ "memory_available_bytes", KSTAT_DATA_INT64 },
	{ "arc_no_grow", KSTAT_DATA_UINT64 },
	{ "arc_tempreserve", KSTAT_DATA_UINT64 },
	{ "arc_loaned_bytes", KSTAT_DATA_UINT64 },
	{ "arc_prune", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_dnode_limit", KSTAT_DATA_UINT64 },
	{ "async_upgrade_sync", KSTAT_DATA_UINT64 },
	{ "predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "prescient_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 },
	{ "arc_need_free", KSTAT_DATA_UINT64 },
	{ "arc_sys_free", KSTAT_DATA_UINT64 },
	{ "arc_raw_size", KSTAT_DATA_UINT64 },
	{ "cached_only_in_progress", KSTAT_DATA_UINT64 },
	{ "abd_chunk_waste_size", KSTAT_DATA_UINT64 },
};
655
656 arc_sums_t arc_sums;
657
/*
 * Lock-free "raise to at least": retry a compare-and-swap until the
 * kstat holds a value >= (val), or another thread has published a
 * larger one.  Never decreases the stat; safe for concurrent updaters.
 */
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
		continue;						\
}
664
665 /*
666 * We define a macro to allow ARC hits/misses to be easily broken down by
667 * two separate conditions, giving a total of four different subtypes for
668 * each of hits and misses (so eight statistics total).
669 */
670 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
671 if (cond1) { \
672 if (cond2) { \
673 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
674 } else { \
675 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
676 } \
677 } else { \
678 if (cond2) { \
679 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
680 } else { \
681 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
682 } \
683 }
684
685 /*
686 * This macro allows us to use kstats as floating averages. Each time we
687 * update this kstat, we first factor it and the update value by
688 * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
689 * average. This macro assumes that integer loads and stores are atomic, but
690 * is not safe for multiple writers updating the kstat in parallel (only the
691 * last writer's update will remain).
692 */
693 #define ARCSTAT_F_AVG_FACTOR 3
694 #define ARCSTAT_F_AVG(stat, value) \
695 do { \
696 uint64_t x = ARCSTAT(stat); \
697 x = x - x / ARCSTAT_F_AVG_FACTOR + \
698 (value) / ARCSTAT_F_AVG_FACTOR; \
699 ARCSTAT(stat) = x; \
700 } while (0)
701
702 static kstat_t *arc_ksp;
703
704 /*
705 * There are several ARC variables that are critical to export as kstats --
706 * but we don't want to have to grovel around in the kstat whenever we wish to
707 * manipulate them. For these variables, we therefore define them to be in
708 * terms of the statistic variable. This assures that we are not introducing
709 * the possibility of inconsistency by having shadow copies of the variables,
710 * while still allowing the code to be readable.
711 */
712 #define arc_tempreserve ARCSTAT(arcstat_tempreserve)
713 #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
714 #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
715 #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
716
717 hrtime_t arc_growtime;
718 list_t arc_prune_list;
719 kmutex_t arc_prune_mtx;
720 taskq_t *arc_prune_taskq;
721
/*
 * True when the given ARC state is a "ghost" state -- one that tracks
 * headers for recently evicted blocks rather than resident data
 * (see the "ARC operation" comment above).
 */
#define	GHOST_STATE(state)						\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

/* Tests of individual arc_buf_hdr_t b_flags bits. */
#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_PRESCIENT_PREFETCH(hdr)	\
	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define	HDR_COMPRESSION_ENABLED(hdr)	\
	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define	HDR_UNCACHED(hdr)	((hdr)->b_flags & ARC_FLAG_UNCACHED)
/* Header has an L2ARC presence and I/O is currently in flight on it. */
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
#define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

/* Buffer-content type: anything not metadata is data. */
#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
/* A raw (encrypted and/or compressed) data buffer is attached. */
#define	HDR_HAS_RABD(hdr)	\
	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
	(hdr)->b_crypt_hdr.b_rabd != NULL)
#define	HDR_ENCRYPTED(hdr)	\
	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
#define	HDR_AUTHENTICATED(hdr)	\
	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))

/* For storing compression mode in b_flags */
#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
766 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
767 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
768
769 #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
770 #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
771 #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
772 #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
773
774 /*
775 * Other sizes
776 */
777
778 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
779 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
780
781 /*
782 * Hash table routines
783 */
784
785 #define BUF_LOCKS 2048
786 typedef struct buf_hash_table {
787 uint64_t ht_mask;
788 arc_buf_hdr_t **ht_table;
789 kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
790 } buf_hash_table_t;
791
792 static buf_hash_table_t buf_hash_table;
793
794 #define BUF_HASH_INDEX(spa, dva, birth) \
795 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
796 #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
797 #define HDR_LOCK(hdr) \
798 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
799
800 uint64_t zfs_crc64_table[256];
801
802 /*
803 * Asynchronous ARC flush
804 *
805 * We track these in a list for arc_async_flush_guid_inuse().
806 * Used for both L1 and L2 async teardown.
807 */
808 static list_t arc_async_flush_list;
809 static kmutex_t arc_async_flush_lock;
810
811 typedef struct arc_async_flush {
812 uint64_t af_spa_guid;
813 taskq_ent_t af_tqent;
814 uint_t af_cache_level; /* 1 or 2 to differentiate node */
815 list_node_t af_node;
816 } arc_async_flush_t;
817
818
819 /*
820 * Level 2 ARC
821 */
822
823 #define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */
824 #define L2ARC_BURST_SIZE_MAX (64 * 1024 * 1024) /* max burst size */
825 #define L2ARC_HEADROOM 8 /* num of writes */
826
827 /*
828 * If we discover during ARC scan any buffers to be compressed, we boost
829 * our headroom for the next scanning cycle by this percentage multiple.
830 */
831 #define L2ARC_HEADROOM_BOOST 200
832 #define L2ARC_FEED_SECS 1 /* caching interval secs */
833 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
834
835 /*
836 * Min L2ARC capacity to enable persistent markers, adaptive intervals, and
837 * DWPD rate limiting. L2ARC must be at least twice arc_c_max to benefit from
838 * inclusive caching - smaller L2ARC would either cyclically overwrite itself
839 * (if L2ARC < ARC) or merely duplicate ARC contents (if L2ARC = ARC).
840 * With L2ARC >= 2*ARC, there's room for ARC duplication plus additional
841 * cached data.
842 */
843 #define L2ARC_PERSIST_THRESHOLD (arc_c_max * 2)
844
845 /* L2ARC Performance Tunables */
846 static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
847 uint64_t l2arc_dwpd_limit = 100; /* 100 = 1.0 DWPD */
848 static uint64_t l2arc_dwpd_bump = 0; /* DWPD reset trigger */
849 static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
850 static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
851 static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
852 static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
853 static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
854 static int l2arc_feed_again = B_TRUE; /* turbo warmup */
855 static int l2arc_norw = B_FALSE; /* no reads during writes */
856 static uint_t l2arc_meta_percent = 33; /* limit on headers size */
857
858 /*
859 * L2ARC Internals
860 */
861 static list_t L2ARC_dev_list; /* device list */
862 static list_t *l2arc_dev_list; /* device list pointer */
863 static kmutex_t l2arc_dev_mtx; /* device list mutex */
864 static list_t L2ARC_free_on_write; /* free after write buf list */
865 static list_t *l2arc_free_on_write; /* free after write list ptr */
866 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
867 static uint64_t l2arc_ndev; /* number of devices */
868
869 typedef struct l2arc_read_callback {
870 arc_buf_hdr_t *l2rcb_hdr; /* read header */
871 blkptr_t l2rcb_bp; /* original blkptr */
872 zbookmark_phys_t l2rcb_zb; /* original bookmark */
873 int l2rcb_flags; /* original flags */
874 abd_t *l2rcb_abd; /* temporary buffer */
875 } l2arc_read_callback_t;
876
877 typedef struct l2arc_data_free {
878 /* protected by l2arc_free_on_write_mtx */
879 abd_t *l2df_abd;
880 l2arc_dev_t *l2df_dev; /* L2ARC device that owns this ABD */
881 list_node_t l2df_list_node;
882 } l2arc_data_free_t;
883
884 typedef enum arc_fill_flags {
885 ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
886 ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
887 ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */
888 ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */
889 ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
890 } arc_fill_flags_t;
891
892 typedef enum arc_ovf_level {
893 ARC_OVF_NONE, /* ARC within target size. */
894 ARC_OVF_SOME, /* ARC is slightly overflowed. */
895 ARC_OVF_SEVERE /* ARC is severely overflowed. */
896 } arc_ovf_level_t;
897
898 static kmutex_t l2arc_rebuild_thr_lock;
899 static kcondvar_t l2arc_rebuild_thr_cv;
900
901 enum arc_hdr_alloc_flags {
902 ARC_HDR_ALLOC_RDATA = 0x1,
903 ARC_HDR_USE_RESERVE = 0x4,
904 ARC_HDR_ALLOC_LINEAR = 0x8,
905 };
906
907
908 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int);
909 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *);
910 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int);
911 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *);
912 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *);
913 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
914 const void *tag);
915 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
916 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
917 static void arc_hdr_destroy(arc_buf_hdr_t *);
918 static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
919 static void arc_buf_watch(arc_buf_t *);
920 static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);
921
922 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
923 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
924 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
925 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
926
927 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
928 static void l2arc_read_done(zio_t *);
929 static void l2arc_do_free_on_write(l2arc_dev_t *dev);
930 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
931 boolean_t state_only);
932 static uint64_t l2arc_get_write_rate(l2arc_dev_t *dev);
933
934 static void arc_prune_async(uint64_t adjust);
935
936 #define l2arc_hdr_arcstats_increment(hdr) \
937 l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
938 #define l2arc_hdr_arcstats_decrement(hdr) \
939 l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
940 #define l2arc_hdr_arcstats_increment_state(hdr) \
941 l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
942 #define l2arc_hdr_arcstats_decrement_state(hdr) \
943 l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
944
945 /*
946 * l2arc_exclude_special : A zfs module parameter that controls whether buffers
 * present on special vdevs are eligible for caching in L2ARC. If
948 * set to 1, exclude dbufs on special vdevs from being cached to
949 * L2ARC.
950 */
951 int l2arc_exclude_special = 0;
952
953 /*
954 * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
955 * metadata and data are cached from ARC into L2ARC.
956 */
957 static int l2arc_mfuonly = 0;
958
959 /*
960 * Depth cap as percentage of state size. Each pass resets its markers
961 * to tail after scanning this fraction of the state. Keeps markers
962 * focused on the tail zone where L2ARC adds the most value.
963 */
964 static uint64_t l2arc_ext_headroom_pct = 25;
965
966 /*
967 * Metadata monopolization limit. When metadata fills the write budget
968 * for this many consecutive cycles while data gets nothing, skip metadata
969 * for one cycle to let data run, then reset the counter.
970 * With N=2, the steady-state pattern under sustained monopolization is
971 * 2 metadata cycles followed by 1 data cycle (67%/33% split).
972 */
973 static uint64_t l2arc_meta_cycles = 2;
974
975 /*
976 * L2ARC TRIM
977 * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
978 * the current write size (l2arc_write_max) we should TRIM if we
979 * have filled the device. It is defined as a percentage of the
980 * write size. If set to 100 we trim twice the space required to
981 * accommodate upcoming writes. A minimum of 64MB will be trimmed.
982 * It also enables TRIM of the whole L2ARC device upon creation or
983 * addition to an existing pool or if the header of the device is
984 * invalid upon importing a pool or onlining a cache device. The
985 * default is 0, which disables TRIM on L2ARC altogether as it can
986 * put significant stress on the underlying storage devices. This
987 * will vary depending of how well the specific device handles
988 * these commands.
989 */
990 static uint64_t l2arc_trim_ahead = 0;
991
992 /*
993 * Performance tuning of L2ARC persistence:
994 *
995 * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
996 * an L2ARC device (either at pool import or later) will attempt
997 * to rebuild L2ARC buffer contents.
998 * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
999 * whether log blocks are written to the L2ARC device. If the L2ARC
1000 * device is less than 1GB, the amount of data l2arc_evict()
1001 * evicts is significant compared to the amount of restored L2ARC
1002 * data. In this case do not write log blocks in L2ARC in order
1003 * not to waste space.
1004 */
1005 static int l2arc_rebuild_enabled = B_TRUE;
1006 static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
1007
1008 /* L2ARC persistence rebuild control routines. */
1009 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
1010 static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg);
1011 static int l2arc_rebuild(l2arc_dev_t *dev);
1012
1013 /* L2ARC persistence read I/O routines. */
1014 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
1015 static int l2arc_log_blk_read(l2arc_dev_t *dev,
1016 const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
1017 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
1018 zio_t *this_io, zio_t **next_io);
1019 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
1020 const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
1021 static void l2arc_log_blk_fetch_abort(zio_t *zio);
1022
1023 /* L2ARC persistence block restoration routines. */
1024 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
1025 const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
1026 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
1027 l2arc_dev_t *dev);
1028
1029 /* L2ARC persistence write I/O routines. */
1030 static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1031 l2arc_write_callback_t *cb);
1032
1033 /* L2ARC persistence auxiliary routines. */
1034 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
1035 const l2arc_log_blkptr_t *lbp);
1036 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1037 const arc_buf_hdr_t *ab);
1038 boolean_t l2arc_range_check_overlap(uint64_t bottom,
1039 uint64_t top, uint64_t check);
1040 static void l2arc_blk_fetch_done(zio_t *zio);
1041 static inline uint64_t
1042 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
1043
1044 /*
1045 * We use Cityhash for this. It's fast, and has good hash properties without
1046 * requiring any large static buffers.
1047 */
1048 static uint64_t
buf_hash(uint64_t spa,const dva_t * dva,uint64_t birth)1049 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1050 {
1051 return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
1052 }
1053
1054 #define HDR_EMPTY(hdr) \
1055 ((hdr)->b_dva.dva_word[0] == 0 && \
1056 (hdr)->b_dva.dva_word[1] == 0)
1057
1058 #define HDR_EMPTY_OR_LOCKED(hdr) \
1059 (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
1060
1061 #define HDR_EQUAL(spa, dva, birth, hdr) \
1062 ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
1063 ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
1064 ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
1065
/*
 * Clear the on-disk identity (DVA and birth TXG) of a header so that it
 * reads as "empty" to HDR_EMPTY(). The b_spa field is left untouched.
 */
static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
	hdr->b_dva.dva_word[0] = 0;
	hdr->b_dva.dva_word[1] = 0;
	hdr->b_birth = 0;
}
1073
/*
 * Look up a header in the hash table by identity (spa, DVA, physical birth).
 * On a hit, the header is returned with the bucket's hash lock held via
 * *lockp; the caller must drop it. On a miss, NULL is returned, *lockp is
 * set to NULL, and no lock is held.
 */
static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp);
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *hdr;

	mutex_enter(hash_lock);
	/* Walk this bucket's separate-chaining list. */
	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
	    hdr = hdr->b_hash_next) {
		if (HDR_EQUAL(spa, dva, birth, hdr)) {
			/* Found: return with the hash lock still held. */
			*lockp = hash_lock;
			return (hdr);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}
1095
/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 * If lockp == NULL, the caller is assumed to already hold the hash lock.
 * In either case the hash lock is held on return; the caller must drop it.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fhdr;
	uint32_t i;

	/* Header must have a real identity and not already be inserted. */
	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
	ASSERT(hdr->b_birth != 0);
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (lockp != NULL) {
		*lockp = hash_lock;
		mutex_enter(hash_lock);
	} else {
		ASSERT(MUTEX_HELD(hash_lock));
	}

	/* Search the chain for an equal header; i counts the chain length. */
	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
	    fhdr = fhdr->b_hash_next, i++) {
		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
			return (fhdr);
	}

	/* Not found: link the new header at the head of the chain. */
	hdr->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = hdr;
	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);
		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}
	ARCSTAT_BUMP(arcstat_hash_elements);

	return (NULL);
}
1143
/*
 * Remove a header from the hash table. The caller must hold the bucket's
 * hash lock and the header must currently be in the table.
 */
static void
buf_hash_remove(arc_buf_hdr_t *hdr)
{
	arc_buf_hdr_t *fhdr, **hdrp;
	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(hdr));

	/* Walk via pointer-to-pointer so head and interior unlink uniformly. */
	hdrp = &buf_hash_table.ht_table[idx];
	while ((fhdr = *hdrp) != hdr) {
		ASSERT3P(fhdr, !=, NULL);
		hdrp = &fhdr->b_hash_next;
	}
	*hdrp = hdr->b_hash_next;
	hdr->b_hash_next = NULL;
	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}
1168
1169 /*
1170 * Global data structures and functions for the buf kmem cache.
1171 */
1172
1173 static kmem_cache_t *hdr_full_cache;
1174 static kmem_cache_t *hdr_l2only_cache;
1175 static kmem_cache_t *buf_cache;
1176
/*
 * Tear down the buf hash table, its bucket locks, and the header/buf kmem
 * caches created by buf_init().
 */
static void
buf_fini(void)
{
#if defined(_KERNEL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_free() in the linux kernel.
	 */
	vmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
#else
	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
#endif
	for (int i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(BUF_HASH_LOCK(i));
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}
1197
/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	(void) unused, (void) kmflag;
	arc_buf_hdr_t *hdr = vbuf;

	memset(hdr, 0, HDR_FULL_SIZE);
	/* DMU_BSWAP_NUMFUNCS acts as the "no byteswap function" sentinel. */
	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
	zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
#ifdef ZFS_DEBUG
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
#endif
	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
	list_link_init(&hdr->b_l2hdr.b_l2node);
	/* Account for the header's memory in the ARC space statistics. */
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}
1220
/*
 * Constructor for the smaller L2-only header cache: only the portion of
 * arc_buf_hdr_t preceding b_l1hdr (HDR_L2ONLY_SIZE bytes) is zeroed and
 * accounted.
 */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	(void) unused, (void) kmflag;
	arc_buf_hdr_t *hdr = vbuf;

	memset(hdr, 0, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}
1232
/*
 * Constructor for arc_buf_t: zero the buf and account for its space.
 */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	(void) unused, (void) kmflag;
	arc_buf_t *buf = vbuf;

	memset(buf, 0, sizeof (arc_buf_t));
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}
1244
/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	(void) unused;
	arc_buf_hdr_t *hdr = vbuf;

	/* The header's identity must already have been discarded. */
	ASSERT(HDR_EMPTY(hdr));
	zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
#ifdef ZFS_DEBUG
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
#endif
	/* It must not still be linked on any ARC state multilist. */
	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
1263
/*
 * Destructor for the L2-only header cache; returns the space accounted by
 * hdr_l2only_cons().
 */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	(void) unused;
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(HDR_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
1273
/*
 * Destructor for arc_buf_t; only returns the space accounted by buf_cons().
 */
static void
buf_dest(void *vbuf, void *unused)
{
	(void) unused;
	(void) vbuf;

	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}
1282
/*
 * Size and allocate the buf hash table (halving on allocation failure),
 * create the header/buf kmem caches, generate the CRC64 table, and
 * initialize the hash bucket locks.
 */
static void
buf_init(void)
{
	uint64_t *ct = NULL;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
	while (hsize * zfs_arc_average_blocksize < arc_all_memory())
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
#if defined(_KERNEL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_alloc() in the linux kernel
	 */
	buf_hash_table.ht_table =
	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
#else
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
#endif
	if (buf_hash_table.ht_table == NULL) {
		/* Allocation failed: retry with a table half the size. */
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
	    NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	/* Generate the reflected CRC64 lookup table for ZFS_CRC64_POLY. */
	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++)
		mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
}
1332
1333 #define ARC_MINTIME (hz>>4) /* 62 ms */
1334
1335 /*
1336 * This is the size that the buf occupies in memory. If the buf is compressed,
1337 * it will correspond to the compressed size. You should use this method of
1338 * getting the buf size unless you explicitly need the logical size.
1339 */
1340 uint64_t
arc_buf_size(arc_buf_t * buf)1341 arc_buf_size(arc_buf_t *buf)
1342 {
1343 return (ARC_BUF_COMPRESSED(buf) ?
1344 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1345 }
1346
/*
 * Return the logical (uncompressed) size of the buf's data.
 */
uint64_t
arc_buf_lsize(arc_buf_t *buf)
{
	return (HDR_GET_LSIZE(buf->b_hdr));
}
1352
1353 /*
1354 * This function will return B_TRUE if the buffer is encrypted in memory.
1355 * This buffer can be decrypted by calling arc_untransform().
1356 */
1357 boolean_t
arc_is_encrypted(arc_buf_t * buf)1358 arc_is_encrypted(arc_buf_t *buf)
1359 {
1360 return (ARC_BUF_ENCRYPTED(buf) != 0);
1361 }
1362
/*
 * Returns B_TRUE if the buffer represents data that has not had its MAC
 * verified yet (i.e. ARC_FLAG_NOAUTH is set on the header).
 */
boolean_t
arc_is_unauthenticated(arc_buf_t *buf)
{
	return (HDR_NOAUTH(buf->b_hdr) != 0);
}
1372
/*
 * Copy out the crypto parameters (salt, IV, MAC, byte order) stored on a
 * protected buf's header. The output buffers must be at least
 * ZIO_DATA_SALT_LEN / ZIO_DATA_IV_LEN / ZIO_DATA_MAC_LEN bytes respectively.
 */
void
arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
    uint8_t *iv, uint8_t *mac)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(HDR_PROTECTED(hdr));

	memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
	memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
	memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
	/* DMU_BSWAP_NUMFUNCS means "no byteswap needed", i.e. native order. */
	*byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
	    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
}
1387
/*
 * Indicates how this buffer is compressed in memory. If it is not compressed
 * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
 * arc_untransform() as long as it is also unencrypted. Note that this
 * reports per-buf state, not the header's on-disk compression.
 */
enum zio_compress
arc_get_compression(arc_buf_t *buf)
{
	return (ARC_BUF_COMPRESSED(buf) ?
	    HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
}
1399
/*
 * Return the compression algorithm used to store this data in the ARC. If ARC
 * compression is enabled or this is an encrypted block, this will be the same
 * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
 */
static inline enum zio_compress
arc_hdr_get_compress(arc_buf_hdr_t *hdr)
{
	return (HDR_COMPRESSION_ENABLED(hdr) ?
	    HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
}
1411
/*
 * Return the compression level recorded on the buf's header.
 */
uint8_t
arc_get_complevel(arc_buf_t *buf)
{
	return (buf->b_hdr->b_complevel);
}
1417
/*
 * Return B_TRUE if the buf's b_data points directly into the hdr's linear
 * b_pabd, i.e. buf and hdr share a single copy of the data. Also
 * cross-checks that result against the hdr/buf flags (debug consistency).
 */
__maybe_unused
static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
	boolean_t shared = (buf->b_data != NULL &&
	    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
	EQUIV(shared, ARC_BUF_SHARED(buf));
	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));

	/*
	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
	 * already being shared" requirement prevents us from doing that.
	 */

	return (shared);
}
1437
/*
 * Free the checksum associated with this header. If there is no checksum, this
 * is a no-op. The freeze checksum only exists in ZFS_DEBUG builds.
 */
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	ASSERT(HDR_HAS_L1HDR(hdr));

	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_l1hdr.b_freeze_cksum = NULL;
	}
	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#endif
}
1456
1457 /*
1458 * Return true iff at least one of the bufs on hdr is not compressed.
1459 * Encrypted buffers count as compressed.
1460 */
1461 static boolean_t
arc_hdr_has_uncompressed_buf(arc_buf_hdr_t * hdr)1462 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1463 {
1464 ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
1465
1466 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1467 if (!ARC_BUF_COMPRESSED(b)) {
1468 return (B_TRUE);
1469 }
1470 }
1471 return (B_FALSE);
1472 }
1473
1474
/*
 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
 * matches the checksum that is stored in the hdr. If there is no checksum,
 * or if the buf is compressed, this is a no-op. Panics if the data was
 * modified while frozen.
 */
static void
arc_cksum_verify(arc_buf_t *buf)
{
#ifdef ZFS_DEBUG
	arc_buf_hdr_t *hdr = buf->b_hdr;
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	/* Freeze checksums are only maintained for uncompressed bufs. */
	if (ARC_BUF_COMPRESSED(buf))
		return;

	ASSERT(HDR_HAS_L1HDR(hdr));

	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);

	/* Nothing to verify without a stored checksum or after an I/O error. */
	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
		return;
	}

	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#endif
}
1508
/*
 * This function makes the assumption that data stored in the L2ARC
 * will be transformed exactly as it is in the main pool. Because of
 * this we can verify the checksum against the reading process's bp.
 * Returns B_TRUE when the zio's data matches the bp's checksum.
 */
static boolean_t
arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
{
	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));

	/*
	 * Block pointers always store the checksum for the logical data.
	 * If the block pointer has the gang bit set, then the checksum
	 * it represents is for the reconstituted data and not for an
	 * individual gang member. The zio pipeline, however, must be able to
	 * determine the checksum of each of the gang constituents so it
	 * treats the checksum comparison differently than what we need
	 * for l2arc blocks. This prevents us from using the
	 * zio_checksum_error() interface directly. Instead we must call the
	 * zio_checksum_error_impl() so that we can ensure the checksum is
	 * generated using the correct checksum algorithm and accounts for the
	 * logical I/O size and not just a gang fragment.
	 */
	return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
	    zio->io_offset, NULL) == 0);
}
1537
/*
 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
 * checksum and attaches it to the buf's hdr so that we can ensure that the buf
 * isn't modified later on. If buf is compressed or there is already a checksum
 * on the hdr, this is a no-op (we only checksum uncompressed bufs).
 */
static void
arc_cksum_compute(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

#ifdef ZFS_DEBUG
	arc_buf_hdr_t *hdr = buf->b_hdr;
	ASSERT(HDR_HAS_L1HDR(hdr));
	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
	/* Already checksummed, or compressed (never checksummed): done. */
	if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
		return;
	}

	ASSERT(!ARC_BUF_ENCRYPTED(buf));
	ASSERT(!ARC_BUF_COMPRESSED(buf));
	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
	    KM_SLEEP);
	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
	    hdr->b_l1hdr.b_freeze_cksum);
	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#endif
	/* In userland, write-protect the buf to catch later modification. */
	arc_buf_watch(buf);
}
1569
#ifndef _KERNEL
/*
 * Userland SIGSEGV handler for the arc_watch facility: a write to an
 * mprotect()ed ARC buffer lands here, reporting the faulting address.
 */
void
arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
{
	(void) sig, (void) unused;
	panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
}
#endif
1578
/*
 * Userland debugging aid: restore read/write access to a buf that was
 * write-protected by arc_buf_watch(). No-op in the kernel.
 */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
		    PROT_READ | PROT_WRITE));
	}
#else
	(void) buf;
#endif
}
1591
/*
 * Userland debugging aid: mark a buf read-only with mprotect() so that any
 * later modification triggers SIGSEGV (see arc_buf_sigsegv). No-op in the
 * kernel.
 */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch)
		ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
		    PROT_READ));
#else
	(void) buf;
#endif
}
1603
1604 static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t * hdr)1605 arc_buf_type(arc_buf_hdr_t *hdr)
1606 {
1607 arc_buf_contents_t type;
1608 if (HDR_ISTYPE_METADATA(hdr)) {
1609 type = ARC_BUFC_METADATA;
1610 } else {
1611 type = ARC_BUFC_DATA;
1612 }
1613 VERIFY3U(hdr->b_type, ==, type);
1614 return (type);
1615 }
1616
/*
 * Return B_TRUE if the buf holds metadata rather than user data.
 */
boolean_t
arc_is_metadata(arc_buf_t *buf)
{
	return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
}
1622
1623 static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)1624 arc_bufc_to_flags(arc_buf_contents_t type)
1625 {
1626 switch (type) {
1627 case ARC_BUFC_DATA:
1628 /* metadata field is 0 if buffer contains normal data */
1629 return (0);
1630 case ARC_BUFC_METADATA:
1631 return (ARC_FLAG_BUFC_METADATA);
1632 default:
1633 break;
1634 }
1635 panic("undefined ARC buffer type!");
1636 return ((uint32_t)-1);
1637 }
1638
/*
 * Allow a frozen (checksummed) buf to be modified again: verify the stored
 * checksum, then drop it and remove the userland write protection.
 */
void
arc_buf_thaw(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/* Only anonymous bufs with no I/O in flight may be thawed. */
	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	arc_cksum_verify(buf);

	/*
	 * Compressed buffers do not manipulate the b_freeze_cksum.
	 */
	if (ARC_BUF_COMPRESSED(buf))
		return;

	ASSERT(HDR_HAS_L1HDR(hdr));
	arc_cksum_free(hdr);
	arc_buf_unwatch(buf);
}
1659
/*
 * Freeze a buf: record its checksum (only when ZFS_DEBUG_MODIFY is set) so
 * subsequent modifications can be detected by arc_cksum_verify().
 */
void
arc_buf_freeze(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	/* Compressed bufs are never checksummed. */
	if (ARC_BUF_COMPRESSED(buf))
		return;

	ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
	arc_cksum_compute(buf);
}
1672
1673 /*
1674 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1675 * the following functions should be used to ensure that the flags are
1676 * updated in a thread-safe way. When manipulating the flags either
1677 * the hash_lock must be held or the hdr must be undiscoverable. This
1678 * ensures that we're not racing with any other threads when updating
1679 * the flags.
1680 */
/* Set bits in b_flags; see thread-safety contract in the comment above. */
static inline void
arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
	/* Caller must hold the hash lock or the hdr must be undiscoverable. */
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
	hdr->b_flags |= flags;
}
1687
/* Clear bits in b_flags; same locking contract as arc_hdr_set_flags(). */
static inline void
arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
	/* Caller must hold the hash lock or the hdr must be undiscoverable. */
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
	hdr->b_flags &= ~flags;
}
1694
1695 /*
1696 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1697 * done in a special way since we have to clear and set bits
1698 * at the same time. Consumers that wish to set the compression bits
1699 * must use this function to ensure that the flags are updated in
1700 * thread-safe manner.
1701 */
static void
arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
{
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * Holes and embedded blocks will always have a psize = 0 so
	 * we ignore the compression of the blkptr and want to
	 * uncompress them.  Mark them as uncompressed.
	 */
	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
	} else {
		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
	}

	/* The compression algorithm is recorded even when disabled above. */
	HDR_SET_COMPRESS(hdr, cmp);
	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
}
1723
1724 /*
1725 * Looks for another buf on the same hdr which has the data decompressed, copies
1726 * from it, and returns true. If no such buf exists, returns false.
1727 */
1728 static boolean_t
arc_buf_try_copy_decompressed_data(arc_buf_t * buf)1729 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1730 {
1731 arc_buf_hdr_t *hdr = buf->b_hdr;
1732 boolean_t copied = B_FALSE;
1733
1734 ASSERT(HDR_HAS_L1HDR(hdr));
1735 ASSERT3P(buf->b_data, !=, NULL);
1736 ASSERT(!ARC_BUF_COMPRESSED(buf));
1737
1738 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
1739 from = from->b_next) {
1740 /* can't use our own data buffer */
1741 if (from == buf) {
1742 continue;
1743 }
1744
1745 if (!ARC_BUF_COMPRESSED(from)) {
1746 memcpy(buf->b_data, from->b_data, arc_buf_size(buf));
1747 copied = B_TRUE;
1748 break;
1749 }
1750 }
1751
1752 #ifdef ZFS_DEBUG
1753 /*
1754 * There were no decompressed bufs, so there should not be a
1755 * checksum on the hdr either.
1756 */
1757 if (zfs_flags & ZFS_DEBUG_MODIFY)
1758 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1759 #endif
1760
1761 return (copied);
1762 }
1763
1764 /*
1765 * Allocates an ARC buf header that's in an evicted & L2-cached state.
1766 * This is used during l2arc reconstruction to make empty ARC buffers
1767 * which circumvent the regular disk->arc->l2arc path and instead come
1768 * into being in the reverse order, i.e. l2arc->arc.
1769 */
static arc_buf_hdr_t *
arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
    dva_t dva, uint64_t daddr, int32_t psize, uint64_t asize, uint64_t birth,
    enum zio_compress compress, uint8_t complevel, boolean_t protected,
    boolean_t prefetch, arc_state_type_t arcs_state)
{
	arc_buf_hdr_t *hdr;

	ASSERT(size != 0);
	ASSERT(dev->l2ad_vdev != NULL);

	/* L2-only hdrs come from a dedicated cache with no L1 portion. */
	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
	hdr->b_birth = birth;
	hdr->b_type = type;
	/* Start from a clean flag set before ORing in type/L2 bits. */
	hdr->b_flags = 0;
	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
	HDR_SET_LSIZE(hdr, size);
	HDR_SET_PSIZE(hdr, psize);
	HDR_SET_L2SIZE(hdr, asize);
	arc_hdr_set_compress(hdr, compress);
	hdr->b_complevel = complevel;
	if (protected)
		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
	if (prefetch)
		arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
	hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);

	hdr->b_dva = dva;

	/* Link the hdr to its on-L2ARC location. */
	hdr->b_l2hdr.b_dev = dev;
	hdr->b_l2hdr.b_daddr = daddr;
	hdr->b_l2hdr.b_arcs_state = arcs_state;

	return (hdr);
}
1805
1806 /*
1807 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1808 */
1809 static uint64_t
arc_hdr_size(arc_buf_hdr_t * hdr)1810 arc_hdr_size(arc_buf_hdr_t *hdr)
1811 {
1812 uint64_t size;
1813
1814 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1815 HDR_GET_PSIZE(hdr) > 0) {
1816 size = HDR_GET_PSIZE(hdr);
1817 } else {
1818 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1819 size = HDR_GET_LSIZE(hdr);
1820 }
1821 return (size);
1822 }
1823
/*
 * Verify the MAC of a protected (authenticated-but-not-encrypted) hdr.
 * On success the ARC_FLAG_NOAUTH flag is cleared; ENOENT (key not
 * loaded) is treated as success since authentication is best effort.
 * Returns 0 or an error (EIO on MAC mismatch or failed recompression).
 */
static int
arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
{
	int ret;
	uint64_t csize;
	uint64_t lsize = HDR_GET_LSIZE(hdr);
	uint64_t psize = HDR_GET_PSIZE(hdr);
	abd_t *abd = hdr->b_l1hdr.b_pabd;
	boolean_t free_abd = B_FALSE;

	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
	ASSERT(HDR_AUTHENTICATED(hdr));
	ASSERT3P(abd, !=, NULL);

	/*
	 * The MAC is calculated on the compressed data that is stored on disk.
	 * However, if compressed arc is disabled we will only have the
	 * decompressed data available to us now. Compress it into a temporary
	 * abd so we can verify the MAC. The performance overhead of this will
	 * be relatively low, since most objects in an encrypted objset will
	 * be encrypted (instead of authenticated) anyway.
	 */
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    !HDR_COMPRESSION_ENABLED(hdr)) {
		abd = NULL;
		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, &abd, lsize, MIN(lsize, psize),
		    hdr->b_complevel);
		if (csize >= lsize || csize > psize) {
			/*
			 * NOTE(review): if zio_compress_data() allocated
			 * *abd despite the size check failing, this return
			 * leaks it — verify zio_compress_data()'s contract.
			 */
			ret = SET_ERROR(EIO);
			return (ret);
		}
		ASSERT3P(abd, !=, NULL);
		/* Pad to psize so the MAC covers the on-disk layout. */
		abd_zero_off(abd, csize, psize - csize);
		free_abd = B_TRUE;
	}

	/*
	 * Authentication is best effort. We authenticate whenever the key is
	 * available. If we succeed we clear ARC_FLAG_NOAUTH.
	 */
	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
		/* Objset blocks use a dedicated MAC scheme. */
		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, psize);
		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
		    psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
	} else {
		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
		    hdr->b_crypt_hdr.b_mac);
	}

	if (ret == 0)
		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
	else if (ret == ENOENT)
		/* Key not loaded: leave NOAUTH set, report success. */
		ret = 0;

	if (free_abd)
		abd_free(abd);

	return (ret);
}
1885
1886 /*
1887 * This function will take a header that only has raw encrypted data in
1888 * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1889 * b_l1hdr.b_pabd. If designated in the header flags, this function will
1890 * also decompress the data.
1891 */
static int
arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
{
	int ret;
	abd_t *cabd = NULL;
	boolean_t no_crypt = B_FALSE;
	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);

	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
	ASSERT(HDR_ENCRYPTED(hdr));

	/* Allocate b_pabd to receive the decrypted data. */
	arc_hdr_alloc_abd(hdr, 0);

	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
	if (ret != 0)
		goto error;

	/* "no_crypt" blocks are stored in the clear; just copy them over. */
	if (no_crypt) {
		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
		    HDR_GET_PSIZE(hdr));
	}

	/*
	 * If this header has disabled arc compression but the b_pabd is
	 * compressed after decrypting it, we need to decompress the newly
	 * decrypted data.
	 */
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    !HDR_COMPRESSION_ENABLED(hdr)) {
		/*
		 * We want to make sure that we are correctly honoring the
		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
		 * and then loan a buffer from it, rather than allocating a
		 * linear buffer and wrapping it in an abd later.
		 */
		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);

		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
		if (ret != 0) {
			goto error;
		}

		/* Swap the compressed b_pabd for the decompressed copy. */
		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
		    arc_hdr_size(hdr), hdr);
		hdr->b_l1hdr.b_pabd = cabd;
	}

	return (0);

error:
	/* Undo the b_pabd allocation (and cabd, if we got that far). */
	arc_hdr_free_abd(hdr, B_FALSE);
	if (cabd != NULL)
		arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);

	return (ret);
}
1953
1954 /*
1955 * This function is called during arc_buf_fill() to prepare the header's
1956 * abd plaintext pointer for use. This involves authenticated protected
1957 * data and decrypting encrypted data into the plaintext abd.
1958 */
static int
arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
    const zbookmark_phys_t *zb, boolean_t noauth)
{
	int ret;

	ASSERT(HDR_PROTECTED(hdr));

	/* hash_lock may be NULL when the caller already holds it (LOCKED). */
	if (hash_lock != NULL)
		mutex_enter(hash_lock);

	if (HDR_NOAUTH(hdr) && !noauth) {
		/*
		 * The caller requested authenticated data but our data has
		 * not been authenticated yet. Verify the MAC now if we can.
		 */
		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
		if (ret != 0)
			goto error;
	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
		/*
		 * If we only have the encrypted version of the data, but the
		 * unencrypted version was requested we take this opportunity
		 * to store the decrypted version in the header for future use.
		 */
		ret = arc_hdr_decrypt(hdr, spa, zb);
		if (ret != 0)
			goto error;
	}

	/* Either path above must have left a plaintext abd in place. */
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

	if (hash_lock != NULL)
		mutex_exit(hash_lock);

	return (0);

error:
	if (hash_lock != NULL)
		mutex_exit(hash_lock);

	return (ret);
}
2002
2003 /*
2004 * This function is used by the dbuf code to decrypt bonus buffers in place.
2005 * The dbuf code itself doesn't have any locking for decrypting a shared dnode
2006 * block, so we use the hash lock here to protect against concurrent calls to
2007 * arc_buf_fill().
2008 */
2009 static void
arc_buf_untransform_in_place(arc_buf_t * buf)2010 arc_buf_untransform_in_place(arc_buf_t *buf)
2011 {
2012 arc_buf_hdr_t *hdr = buf->b_hdr;
2013
2014 ASSERT(HDR_ENCRYPTED(hdr));
2015 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
2016 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2017 ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf);
2018
2019 zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
2020 arc_buf_size(buf));
2021 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
2022 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2023 }
2024
2025 /*
2026 * Given a buf that has a data buffer attached to it, this function will
2027 * efficiently fill the buf with data of the specified compression setting from
2028 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
2029 * are already sharing a data buf, no copy is performed.
2030 *
2031 * If the buf is marked as compressed but uncompressed data was requested, this
2032 * will allocate a new data buffer for the buf, remove that flag, and fill the
2033 * buf with uncompressed data. You can't request a compressed buf on a hdr with
2034 * uncompressed data, and (since we haven't added support for it yet) if you
2035 * want compressed data your buf must already be marked as compressed and have
2036 * the correct-sized data buffer.
2037 */
static int
arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
    arc_fill_flags_t flags)
{
	int error = 0;
	arc_buf_hdr_t *hdr = buf->b_hdr;
	boolean_t hdr_compressed =
	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
	/* ARC_FILL_LOCKED means the caller already holds the hash lock. */
	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);

	ASSERT3P(buf->b_data, !=, NULL);
	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
	IMPLY(encrypted, !arc_buf_is_shared(buf));

	/*
	 * If the caller wanted encrypted data we just need to copy it from
	 * b_rabd and potentially byteswap it. We won't be able to do any
	 * further transforms on it.
	 */
	if (encrypted) {
		ASSERT(HDR_HAS_RABD(hdr));
		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
		    HDR_GET_PSIZE(hdr));
		goto byteswap;
	}

	/*
	 * Adjust encrypted and authenticated headers to accommodate
	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
	 * allowed to fail decryption due to keys not being loaded
	 * without being marked as an IO error.
	 */
	if (HDR_PROTECTED(hdr)) {
		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
		    zb, !!(flags & ARC_FILL_NOAUTH));
		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
			return (error);
		} else if (error != 0) {
			if (hash_lock != NULL)
				mutex_enter(hash_lock);
			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
			if (hash_lock != NULL)
				mutex_exit(hash_lock);
			return (error);
		}
	}

	/*
	 * There is a special case here for dnode blocks which are
	 * decrypting their bonus buffers. These blocks may request to
	 * be decrypted in-place. This is necessary because there may
	 * be many dnodes pointing into this buffer and there is
	 * currently no method to synchronize replacing the backing
	 * b_data buffer and updating all of the pointers. Here we use
	 * the hash lock to ensure there are no races. If the need
	 * arises for other types to be decrypted in-place, they must
	 * add handling here as well.
	 */
	if ((flags & ARC_FILL_IN_PLACE) != 0) {
		ASSERT(!hdr_compressed);
		ASSERT(!compressed);
		ASSERT(!encrypted);

		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);

			if (hash_lock != NULL)
				mutex_enter(hash_lock);
			arc_buf_untransform_in_place(buf);
			if (hash_lock != NULL)
				mutex_exit(hash_lock);

			/* Compute the hdr's checksum if necessary */
			arc_cksum_compute(buf);
		}

		return (0);
	}

	if (hdr_compressed == compressed) {
		/* Same form as the hdr: share or straight copy suffices. */
		if (ARC_BUF_SHARED(buf)) {
			ASSERT(arc_buf_is_shared(buf));
		} else {
			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
			    arc_buf_size(buf));
		}
	} else {
		ASSERT(hdr_compressed);
		ASSERT(!compressed);

		/*
		 * If the buf is sharing its data with the hdr, unlink it and
		 * allocate a new data buffer for the buf.
		 */
		if (ARC_BUF_SHARED(buf)) {
			ASSERTF(ARC_BUF_COMPRESSED(buf),
			    "buf %p was uncompressed", buf);

			/* We need to give the buf its own b_data */
			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
			buf->b_data =
			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);

			/* Previously overhead was 0; just add new overhead */
			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
		} else if (ARC_BUF_COMPRESSED(buf)) {
			ASSERT(!arc_buf_is_shared(buf));

			/* We need to reallocate the buf's b_data */
			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
			    buf);
			buf->b_data =
			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);

			/* We increased the size of b_data; update overhead */
			ARCSTAT_INCR(arcstat_overhead_size,
			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
		}

		/*
		 * Regardless of the buf's previous compression settings, it
		 * should not be compressed at the end of this function.
		 */
		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;

		/*
		 * Try copying the data from another buf which already has a
		 * decompressed version. If that's not possible, it's time to
		 * bite the bullet and decompress the data from the hdr.
		 */
		if (arc_buf_try_copy_decompressed_data(buf)) {
			/* Skip byteswapping and checksumming (already done) */
			return (0);
		} else {
			abd_t dabd;
			abd_get_from_buf_struct(&dabd, buf->b_data,
			    HDR_GET_LSIZE(hdr));
			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
			    hdr->b_l1hdr.b_pabd, &dabd,
			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
			    &hdr->b_complevel);
			abd_free(&dabd);

			/*
			 * Absent hardware errors or software bugs, this should
			 * be impossible, but log it anyway so we can debug it.
			 */
			if (error != 0) {
				zfs_dbgmsg(
				    "hdr %px, compress %d, psize %d, lsize %d",
				    hdr, arc_hdr_get_compress(hdr),
				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
				if (hash_lock != NULL)
					mutex_enter(hash_lock);
				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
				if (hash_lock != NULL)
					mutex_exit(hash_lock);
				return (SET_ERROR(EIO));
			}
		}
	}

byteswap:
	/* Byteswap the buf's data if necessary */
	if (bswap != DMU_BSWAP_NUMFUNCS) {
		ASSERT(!HDR_SHARED_DATA(hdr));
		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
	}

	/* Compute the hdr's checksum if necessary */
	arc_cksum_compute(buf);

	return (0);
}
2221
2222 /*
2223 * If this function is being called to decrypt an encrypted buffer or verify an
2224 * authenticated one, the key must be loaded and a mapping must be made
2225 * available in the keystore via spa_keystore_create_mapping() or one of its
2226 * callers.
2227 */
2228 int
arc_untransform(arc_buf_t * buf,spa_t * spa,const zbookmark_phys_t * zb,boolean_t in_place)2229 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2230 boolean_t in_place)
2231 {
2232 int ret;
2233 arc_fill_flags_t flags = 0;
2234
2235 if (in_place)
2236 flags |= ARC_FILL_IN_PLACE;
2237
2238 ret = arc_buf_fill(buf, spa, zb, flags);
2239 if (ret == ECKSUM) {
2240 /*
2241 * Convert authentication and decryption errors to EIO
2242 * (and generate an ereport) before leaving the ARC.
2243 */
2244 ret = SET_ERROR(EIO);
2245 spa_log_error(spa, zb, buf->b_hdr->b_birth);
2246 (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2247 spa, NULL, zb, NULL, 0);
2248 }
2249
2250 return (ret);
2251 }
2252
2253 /*
2254 * Increment the amount of evictable space in the arc_state_t's refcount.
2255 * We account for the space used by the hdr and the arc buf individually
2256 * so that we can add and remove them from the refcount individually.
2257 */
2258 static void
arc_evictable_space_increment(arc_buf_hdr_t * hdr,arc_state_t * state)2259 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2260 {
2261 arc_buf_contents_t type = arc_buf_type(hdr);
2262
2263 ASSERT(HDR_HAS_L1HDR(hdr));
2264
2265 if (GHOST_STATE(state)) {
2266 ASSERT0P(hdr->b_l1hdr.b_buf);
2267 ASSERT0P(hdr->b_l1hdr.b_pabd);
2268 ASSERT(!HDR_HAS_RABD(hdr));
2269 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2270 HDR_GET_LSIZE(hdr), hdr);
2271 return;
2272 }
2273
2274 if (hdr->b_l1hdr.b_pabd != NULL) {
2275 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2276 arc_hdr_size(hdr), hdr);
2277 }
2278 if (HDR_HAS_RABD(hdr)) {
2279 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2280 HDR_GET_PSIZE(hdr), hdr);
2281 }
2282
2283 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2284 buf = buf->b_next) {
2285 if (ARC_BUF_SHARED(buf))
2286 continue;
2287 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2288 arc_buf_size(buf), buf);
2289 }
2290 }
2291
2292 /*
2293 * Decrement the amount of evictable space in the arc_state_t's refcount.
2294 * We account for the space used by the hdr and the arc buf individually
2295 * so that we can add and remove them from the refcount individually.
2296 */
2297 static void
arc_evictable_space_decrement(arc_buf_hdr_t * hdr,arc_state_t * state)2298 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2299 {
2300 arc_buf_contents_t type = arc_buf_type(hdr);
2301
2302 ASSERT(HDR_HAS_L1HDR(hdr));
2303
2304 if (GHOST_STATE(state)) {
2305 ASSERT0P(hdr->b_l1hdr.b_buf);
2306 ASSERT0P(hdr->b_l1hdr.b_pabd);
2307 ASSERT(!HDR_HAS_RABD(hdr));
2308 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2309 HDR_GET_LSIZE(hdr), hdr);
2310 return;
2311 }
2312
2313 if (hdr->b_l1hdr.b_pabd != NULL) {
2314 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2315 arc_hdr_size(hdr), hdr);
2316 }
2317 if (HDR_HAS_RABD(hdr)) {
2318 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2319 HDR_GET_PSIZE(hdr), hdr);
2320 }
2321
2322 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2323 buf = buf->b_next) {
2324 if (ARC_BUF_SHARED(buf))
2325 continue;
2326 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2327 arc_buf_size(buf), buf);
2328 }
2329 }
2330
2331 /*
2332 * Add a reference to this hdr indicating that someone is actively
2333 * referencing that memory. When the refcount transitions from 0 to 1,
2334 * we remove it from the respective arc_state_t list to indicate that
2335 * it is not evictable.
2336 */
static void
add_reference(arc_buf_hdr_t *hdr, const void *tag)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;

	ASSERT(HDR_HAS_L1HDR(hdr));
	/* Without the hash lock, only an unreferenced anon hdr is legal. */
	if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
		ASSERT(state == arc_anon);
		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT0P(hdr->b_l1hdr.b_buf);
	}

	/* On the 0 -> 1 transition the hdr becomes unevictable. */
	if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
	    state != arc_anon && state != arc_l2c_only) {
		/* We don't use the L2-only state list. */
		multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr);
		arc_evictable_space_decrement(hdr, state);
	}
}
2356
2357 /*
2358 * Remove a reference from this hdr. When the reference transitions from
2359 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2360 * list making it eligible for eviction.
2361 */
static int
remove_reference(arc_buf_hdr_t *hdr, const void *tag)
{
	int cnt;
	arc_state_t *state = hdr->b_l1hdr.b_state;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
	ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */

	/* Still referenced: report the remaining count and return. */
	if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0)
		return (cnt);

	/* Last reference dropped: anon hdrs are destroyed outright. */
	if (state == arc_anon) {
		arc_hdr_destroy(hdr);
		return (0);
	}
	/* Non-prefetch uncached hdrs are also destroyed on last release. */
	if (state == arc_uncached && !HDR_PREFETCH(hdr)) {
		arc_change_state(arc_anon, hdr);
		arc_hdr_destroy(hdr);
		return (0);
	}
	/* Otherwise the hdr becomes evictable again. */
	multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
	arc_evictable_space_increment(hdr, state);
	return (0);
}
2388
2389 /*
2390 * Returns detailed information about a specific arc buffer. When the
2391 * state_index argument is set the function will calculate the arc header
2392 * list position for its arc state. Since this requires a linear traversal
2393 * callers are strongly encourage not to do this. However, it can be helpful
2394 * for targeted analysis so the functionality is provided.
2395 */
2396 void
arc_buf_info(arc_buf_t * ab,arc_buf_info_t * abi,int state_index)2397 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
2398 {
2399 (void) state_index;
2400 arc_buf_hdr_t *hdr = ab->b_hdr;
2401 l1arc_buf_hdr_t *l1hdr = NULL;
2402 l2arc_buf_hdr_t *l2hdr = NULL;
2403 arc_state_t *state = NULL;
2404
2405 memset(abi, 0, sizeof (arc_buf_info_t));
2406
2407 if (hdr == NULL)
2408 return;
2409
2410 abi->abi_flags = hdr->b_flags;
2411
2412 if (HDR_HAS_L1HDR(hdr)) {
2413 l1hdr = &hdr->b_l1hdr;
2414 state = l1hdr->b_state;
2415 }
2416 if (HDR_HAS_L2HDR(hdr))
2417 l2hdr = &hdr->b_l2hdr;
2418
2419 if (l1hdr) {
2420 abi->abi_bufcnt = 0;
2421 for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next)
2422 abi->abi_bufcnt++;
2423 abi->abi_access = l1hdr->b_arc_access;
2424 abi->abi_mru_hits = l1hdr->b_mru_hits;
2425 abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
2426 abi->abi_mfu_hits = l1hdr->b_mfu_hits;
2427 abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
2428 abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
2429 }
2430
2431 if (l2hdr) {
2432 abi->abi_l2arc_dattr = l2hdr->b_daddr;
2433 abi->abi_l2arc_hits = l2hdr->b_hits;
2434 }
2435
2436 abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
2437 abi->abi_state_contents = arc_buf_type(hdr);
2438 abi->abi_size = arc_hdr_size(hdr);
2439 }
2440
2441 /*
2442 * Move the supplied buffer to the indicated state. The hash lock
2443 * for the buffer must be held by the caller.
2444 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
{
	arc_state_t *old_state;
	int64_t refcnt;
	boolean_t update_old, update_new;
	arc_buf_contents_t type = arc_buf_type(hdr);

	/*
	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
	 * in arc_read() when bringing a buffer out of the L2ARC. However, the
	 * L1 hdr doesn't always exist when we change state to arc_anon before
	 * destroying a header, in which case reallocating to add the L1 hdr is
	 * pointless.
	 */
	if (HDR_HAS_L1HDR(hdr)) {
		old_state = hdr->b_l1hdr.b_state;
		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
		/* Only hdrs holding data need size accounting updates. */
		update_old = (hdr->b_l1hdr.b_buf != NULL ||
		    hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));

		IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
		IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
		IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
		    ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
	} else {
		old_state = arc_l2c_only;
		refcnt = 0;
		update_old = B_FALSE;
	}
	update_new = update_old;
	/* Ghost states account LSIZE regardless of attached data. */
	if (GHOST_STATE(old_state))
		update_old = B_TRUE;
	if (GHOST_STATE(new_state))
		update_new = B_TRUE;

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	ASSERT3P(new_state, !=, old_state);

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon && old_state != arc_l2c_only) {
			ASSERT(HDR_HAS_L1HDR(hdr));
			/* remove_reference() saves on insert. */
			if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
				multilist_remove(&old_state->arcs_list[type],
				    hdr);
				arc_evictable_space_decrement(hdr, old_state);
			}
		}
		if (new_state != arc_anon && new_state != arc_l2c_only) {
			/*
			 * An L1 header always exists here, since if we're
			 * moving to some L1-cached state (i.e. not l2c_only or
			 * anonymous), we realloc the header to add an L1hdr
			 * beforehand.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_insert(&new_state->arcs_list[type], hdr);
			arc_evictable_space_increment(hdr, new_state);
		}
	}

	ASSERT(!HDR_EMPTY(hdr));
	/* Anonymous hdrs are not discoverable via the hash table. */
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
		buf_hash_remove(hdr);

	/* adjust state sizes (ignore arc_l2c_only) */

	if (update_new && new_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(new_state)) {

			/*
			 * When moving a header to a ghost state, we first
			 * remove all arc buffers. Thus, we'll have no arc
			 * buffer to use for the reference. As a result, we
			 * use the arc header pointer for the reference.
			 */
			(void) zfs_refcount_add_many(
			    &new_state->arcs_size[type],
			    HDR_GET_LSIZE(hdr), hdr);
			ASSERT0P(hdr->b_l1hdr.b_pabd);
			ASSERT(!HDR_HAS_RABD(hdr));
		} else {

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * add to the refcount if the arc_buf_t is
				 * not shared.
				 */
				if (ARC_BUF_SHARED(buf))
					continue;

				(void) zfs_refcount_add_many(
				    &new_state->arcs_size[type],
				    arc_buf_size(buf), buf);
			}

			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) zfs_refcount_add_many(
				    &new_state->arcs_size[type],
				    arc_hdr_size(hdr), hdr);
			}

			if (HDR_HAS_RABD(hdr)) {
				(void) zfs_refcount_add_many(
				    &new_state->arcs_size[type],
				    HDR_GET_PSIZE(hdr), hdr);
			}
		}
	}

	if (update_old && old_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(old_state)) {
			ASSERT0P(hdr->b_l1hdr.b_pabd);
			ASSERT(!HDR_HAS_RABD(hdr));

			/*
			 * When moving a header off of a ghost state,
			 * the header will not contain any arc buffers.
			 * We use the arc header pointer for the reference
			 * which is exactly what we did when we put the
			 * header on the ghost state.
			 */

			(void) zfs_refcount_remove_many(
			    &old_state->arcs_size[type],
			    HDR_GET_LSIZE(hdr), hdr);
		} else {

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * add to the refcount if the arc_buf_t is
				 * not shared.
				 */
				if (ARC_BUF_SHARED(buf))
					continue;

				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size[type],
				    arc_buf_size(buf), buf);
			}
			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
			    HDR_HAS_RABD(hdr));

			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size[type],
				    arc_hdr_size(hdr), hdr);
			}

			if (HDR_HAS_RABD(hdr)) {
				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size[type],
				    HDR_GET_PSIZE(hdr), hdr);
			}
		}
	}

	if (HDR_HAS_L1HDR(hdr)) {
		hdr->b_l1hdr.b_state = new_state;

		/* Keep the L2 hdr's cached-state stats in sync. */
		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
			l2arc_hdr_arcstats_decrement_state(hdr);
			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
			l2arc_hdr_arcstats_increment_state(hdr);
		}
	}
}
2639
2640 void
arc_space_consume(uint64_t space,arc_space_type_t type)2641 arc_space_consume(uint64_t space, arc_space_type_t type)
2642 {
2643 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2644
2645 switch (type) {
2646 default:
2647 break;
2648 case ARC_SPACE_DATA:
2649 ARCSTAT_INCR(arcstat_data_size, space);
2650 break;
2651 case ARC_SPACE_META:
2652 ARCSTAT_INCR(arcstat_metadata_size, space);
2653 break;
2654 case ARC_SPACE_BONUS:
2655 ARCSTAT_INCR(arcstat_bonus_size, space);
2656 break;
2657 case ARC_SPACE_DNODE:
2658 aggsum_add(&arc_sums.arcstat_dnode_size, space);
2659 break;
2660 case ARC_SPACE_DBUF:
2661 ARCSTAT_INCR(arcstat_dbuf_size, space);
2662 break;
2663 case ARC_SPACE_HDRS:
2664 ARCSTAT_INCR(arcstat_hdr_size, space);
2665 break;
2666 case ARC_SPACE_L2HDRS:
2667 aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
2668 break;
2669 case ARC_SPACE_ABD_CHUNK_WASTE:
2670 /*
2671 * Note: this includes space wasted by all scatter ABD's, not
2672 * just those allocated by the ARC. But the vast majority of
2673 * scatter ABD's come from the ARC, because other users are
2674 * very short-lived.
2675 */
2676 ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
2677 break;
2678 }
2679
2680 if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
2681 ARCSTAT_INCR(arcstat_meta_used, space);
2682
2683 aggsum_add(&arc_sums.arcstat_size, space);
2684 }
2685
2686 void
arc_space_return(uint64_t space,arc_space_type_t type)2687 arc_space_return(uint64_t space, arc_space_type_t type)
2688 {
2689 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2690
2691 switch (type) {
2692 default:
2693 break;
2694 case ARC_SPACE_DATA:
2695 ARCSTAT_INCR(arcstat_data_size, -space);
2696 break;
2697 case ARC_SPACE_META:
2698 ARCSTAT_INCR(arcstat_metadata_size, -space);
2699 break;
2700 case ARC_SPACE_BONUS:
2701 ARCSTAT_INCR(arcstat_bonus_size, -space);
2702 break;
2703 case ARC_SPACE_DNODE:
2704 aggsum_add(&arc_sums.arcstat_dnode_size, -space);
2705 break;
2706 case ARC_SPACE_DBUF:
2707 ARCSTAT_INCR(arcstat_dbuf_size, -space);
2708 break;
2709 case ARC_SPACE_HDRS:
2710 ARCSTAT_INCR(arcstat_hdr_size, -space);
2711 break;
2712 case ARC_SPACE_L2HDRS:
2713 aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
2714 break;
2715 case ARC_SPACE_ABD_CHUNK_WASTE:
2716 ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
2717 break;
2718 }
2719
2720 if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
2721 ARCSTAT_INCR(arcstat_meta_used, -space);
2722
2723 ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
2724 aggsum_add(&arc_sums.arcstat_size, -space);
2725 }
2726
2727 /*
2728 * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2729 * with the hdr's b_pabd.
2730 */
2731 static boolean_t
arc_can_share(arc_buf_hdr_t * hdr,arc_buf_t * buf)2732 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2733 {
2734 /*
2735 * The criteria for sharing a hdr's data are:
2736 * 1. the buffer is not encrypted
2737 * 2. the hdr's compression matches the buf's compression
2738 * 3. the hdr doesn't need to be byteswapped
2739 * 4. the hdr isn't already being shared
2740 * 5. the buf is either compressed or it is the last buf in the hdr list
2741 *
2742 * Criterion #5 maintains the invariant that shared uncompressed
2743 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2744 * might ask, "if a compressed buf is allocated first, won't that be the
2745 * last thing in the list?", but in that case it's impossible to create
2746 * a shared uncompressed buf anyway (because the hdr must be compressed
2747 * to have the compressed buf). You might also think that #3 is
2748 * sufficient to make this guarantee, however it's possible
2749 * (specifically in the rare L2ARC write race mentioned in
2750 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2751 * is shareable, but wasn't at the time of its allocation. Rather than
2752 * allow a new shared uncompressed buf to be created and then shuffle
2753 * the list around to make it the last element, this simply disallows
2754 * sharing if the new buf isn't the first to be added.
2755 */
2756 ASSERT3P(buf->b_hdr, ==, hdr);
2757 boolean_t hdr_compressed =
2758 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
2759 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2760 return (!ARC_BUF_ENCRYPTED(buf) &&
2761 buf_compressed == hdr_compressed &&
2762 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2763 !HDR_SHARED_DATA(hdr) &&
2764 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2765 }
2766
2767 /*
2768 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
2769 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
2770 * copy was made successfully, or an error code otherwise.
2771 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
    const void *tag, boolean_t encrypted, boolean_t compressed,
    boolean_t noauth, boolean_t fill, arc_buf_t **ret)
{
	arc_buf_t *buf;
	arc_fill_flags_t flags = ARC_FILL_LOCKED;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
	    hdr->b_type == ARC_BUFC_METADATA);
	ASSERT3P(ret, !=, NULL);
	ASSERT0P(*ret);
	/* Callers requesting an encrypted buf must also mark it compressed. */
	IMPLY(encrypted, compressed);

	/* Allocate the buf and stage it for insertion at the list head. */
	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_next = hdr->b_l1hdr.b_buf;
	buf->b_flags = 0;

	add_reference(hdr, tag);

	/*
	 * We're about to change the hdr's b_flags. We must either
	 * hold the hash_lock or be undiscoverable.
	 */
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * Only honor requests for compressed bufs if the hdr is actually
	 * compressed. This must be overridden if the buffer is encrypted since
	 * encrypted buffers cannot be decompressed.
	 */
	if (encrypted) {
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
	} else if (compressed &&
	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
		flags |= ARC_FILL_COMPRESSED;
	}

	if (noauth) {
		ASSERT0(encrypted);
		flags |= ARC_FILL_NOAUTH;
	}

	/*
	 * If the hdr's data can be shared then we share the data buffer and
	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
	 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
	 * buffer to store the buf's data.
	 *
	 * There are two additional restrictions here because we're sharing
	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
	 * actively involved in an L2ARC write, because if this buf is used by
	 * an arc_write() then the hdr's data buffer will be released when the
	 * write completes, even though the L2ARC write might still be using it.
	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
	 * need to be ABD-aware. It must be allocated via
	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
	 * page" buffers because the ABD code needs to handle freeing them
	 * specially.
	 */
	boolean_t can_share = arc_can_share(hdr, buf) &&
	    !HDR_L2_WRITING(hdr) &&
	    hdr->b_l1hdr.b_pabd != NULL &&
	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);

	/* Set up b_data and sharing */
	if (can_share) {
		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
		buf->b_flags |= ARC_BUF_FLAG_SHARED;
		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	} else {
		/*
		 * arc_buf_size() depends on ARC_BUF_FLAG_COMPRESSED, which
		 * was finalized above, so this must come after the flag
		 * setup.
		 */
		buf->b_data =
		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
	}
	VERIFY3P(buf->b_data, !=, NULL);

	/* Now that the buf is fully set up, link it into the hdr's list. */
	hdr->b_l1hdr.b_buf = buf;

	/*
	 * If the user wants the data from the hdr, we need to either copy or
	 * decompress the data.
	 */
	if (fill) {
		ASSERT3P(zb, !=, NULL);
		return (arc_buf_fill(buf, spa, zb, flags));
	}

	return (0);
}
2871
2872 static const char *arc_onloan_tag = "onloan";
2873
/*
 * Adjust the global count of bytes currently out on loan via
 * arc_loan_{buf,compressed_buf,raw_buf}().  'delta' is negative when a
 * loaned buffer is returned or detached.
 */
static inline void
arc_loaned_bytes_update(int64_t delta)
{
	atomic_add_64(&arc_loaned_bytes, delta);

	/* assert that it did not wrap around */
	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
2882
2883 /*
2884 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2885 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2886 * buffers must be returned to the arc before they can be used by the DMU or
2887 * freed.
2888 */
2889 arc_buf_t *
arc_loan_buf(spa_t * spa,boolean_t is_metadata,int size)2890 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2891 {
2892 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2893 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2894
2895 arc_loaned_bytes_update(arc_buf_size(buf));
2896
2897 return (buf);
2898 }
2899
2900 arc_buf_t *
arc_loan_compressed_buf(spa_t * spa,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)2901 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2902 enum zio_compress compression_type, uint8_t complevel)
2903 {
2904 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2905 psize, lsize, compression_type, complevel);
2906
2907 arc_loaned_bytes_update(arc_buf_size(buf));
2908
2909 return (buf);
2910 }
2911
2912 arc_buf_t *
arc_loan_raw_buf(spa_t * spa,uint64_t dsobj,boolean_t byteorder,const uint8_t * salt,const uint8_t * iv,const uint8_t * mac,dmu_object_type_t ot,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)2913 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2914 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2915 dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2916 enum zio_compress compression_type, uint8_t complevel)
2917 {
2918 arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2919 byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
2920 complevel);
2921
2922 atomic_add_64(&arc_loaned_bytes, psize);
2923 return (buf);
2924 }
2925
2926
2927 /*
2928 * Return a loaned arc buffer to the arc.
2929 */
2930 void
arc_return_buf(arc_buf_t * buf,const void * tag)2931 arc_return_buf(arc_buf_t *buf, const void *tag)
2932 {
2933 arc_buf_hdr_t *hdr = buf->b_hdr;
2934
2935 ASSERT3P(buf->b_data, !=, NULL);
2936 ASSERT(HDR_HAS_L1HDR(hdr));
2937 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2938 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2939
2940 arc_loaned_bytes_update(-arc_buf_size(buf));
2941 }
2942
2943 /* Detach an arc_buf from a dbuf (tag) */
2944 void
arc_loan_inuse_buf(arc_buf_t * buf,const void * tag)2945 arc_loan_inuse_buf(arc_buf_t *buf, const void *tag)
2946 {
2947 arc_buf_hdr_t *hdr = buf->b_hdr;
2948
2949 ASSERT3P(buf->b_data, !=, NULL);
2950 ASSERT(HDR_HAS_L1HDR(hdr));
2951 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2952 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2953
2954 arc_loaned_bytes_update(arc_buf_size(buf));
2955 }
2956
2957 static void
l2arc_free_abd_on_write(abd_t * abd,l2arc_dev_t * dev)2958 l2arc_free_abd_on_write(abd_t *abd, l2arc_dev_t *dev)
2959 {
2960 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2961
2962 df->l2df_abd = abd;
2963 df->l2df_dev = dev;
2964 mutex_enter(&l2arc_free_on_write_mtx);
2965 list_insert_head(l2arc_free_on_write, df);
2966 mutex_exit(&l2arc_free_on_write_mtx);
2967 }
2968
/*
 * Undo the ARC accounting for a hdr's data (or raw data, if free_rdata)
 * now, but defer the actual free of the ABD until the in-flight L2ARC
 * write that still references it has completed (via the l2arc
 * free-on-write list).
 */
static void
arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;
	arc_buf_contents_t type = arc_buf_type(hdr);
	/* Raw (encrypted) data is sized by psize; otherwise by hdr size. */
	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);

	/* protected by hash lock, if in the hash table */
	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT(state != arc_anon && state != arc_l2c_only);

		/* Evictable size is only tracked while on a state list. */
		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
		    size, hdr);
	}
	(void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr);
	if (type == ARC_BUFC_METADATA) {
		arc_space_return(size, ARC_SPACE_META);
	} else {
		ASSERT(type == ARC_BUFC_DATA);
		arc_space_return(size, ARC_SPACE_DATA);
	}

	/*
	 * L2HDR must exist since we're freeing an L2ARC-related ABD.
	 */
	ASSERT(HDR_HAS_L2HDR(hdr));

	if (free_rdata) {
		l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd,
		    hdr->b_l2hdr.b_dev);
	} else {
		l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd,
		    hdr->b_l2hdr.b_dev);
	}
}
3005
3006 /*
3007 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
3008 * data buffer, we transfer the refcount ownership to the hdr and update
3009 * the appropriate kstats.
3010 */
static void
arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	ASSERT(arc_can_share(hdr, buf));
	ASSERT0P(hdr->b_l1hdr.b_pabd);
	ASSERT(!ARC_BUF_ENCRYPTED(buf));
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * Start sharing the data buffer. We transfer the
	 * refcount ownership to the hdr since it always owns
	 * the refcount whenever an arc_buf_t is shared.
	 */
	zfs_refcount_transfer_ownership_many(
	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
	    arc_hdr_size(hdr), buf, hdr);
	/* Wrap the buf's data in an ABD and let the hdr take ownership. */
	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
	    HDR_ISTYPE_METADATA(hdr));
	/* Mark both sides as sharing so teardown knows who frees the data. */
	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	buf->b_flags |= ARC_BUF_FLAG_SHARED;

	/*
	 * Since we've transferred ownership to the hdr we need
	 * to increment its compressed and uncompressed kstats and
	 * decrement the overhead size.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
}
3042
/*
 * Inverse of arc_share_buf(): stop sharing buf's b_data with the hdr.
 * The arcs_size refcount is transferred back to the buf, the hdr's
 * wrapping ABD is released and freed, and the buf's bytes are counted
 * as overhead again.
 */
static void
arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
	ASSERT(arc_buf_is_shared(buf));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

	/*
	 * We are no longer sharing this buffer so we need
	 * to transfer its ownership to the rightful owner.
	 */
	zfs_refcount_transfer_ownership_many(
	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
	    arc_hdr_size(hdr), hdr, buf);
	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
	/* Release the ABD wrapper without freeing the underlying b_data. */
	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
	abd_free(hdr->b_l1hdr.b_pabd);
	hdr->b_l1hdr.b_pabd = NULL;
	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;

	/*
	 * Since the buffer is no longer shared between
	 * the arc buf and the hdr, count it as overhead.
	 */
	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
3071
3072 /*
3073 * Remove an arc_buf_t from the hdr's buf list and return the last
3074 * arc_buf_t on the list. If no buffers remain on the list then return
3075 * NULL.
3076 */
3077 static arc_buf_t *
arc_buf_remove(arc_buf_hdr_t * hdr,arc_buf_t * buf)3078 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3079 {
3080 ASSERT(HDR_HAS_L1HDR(hdr));
3081 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
3082
3083 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
3084 arc_buf_t *lastbuf = NULL;
3085
3086 /*
3087 * Remove the buf from the hdr list and locate the last
3088 * remaining buffer on the list.
3089 */
3090 while (*bufp != NULL) {
3091 if (*bufp == buf)
3092 *bufp = buf->b_next;
3093
3094 /*
3095 * If we've removed a buffer in the middle of
3096 * the list then update the lastbuf and update
3097 * bufp.
3098 */
3099 if (*bufp != NULL) {
3100 lastbuf = *bufp;
3101 bufp = &(*bufp)->b_next;
3102 }
3103 }
3104 buf->b_next = NULL;
3105 ASSERT3P(lastbuf, !=, buf);
3106 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
3107
3108 return (lastbuf);
3109 }
3110
3111 /*
3112 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
3113 * list and free it.
3114 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Free up the data associated with the buf but only if we're not
	 * sharing this with the hdr. If we are sharing it with the hdr, the
	 * hdr is responsible for doing the free.
	 */
	if (buf->b_data != NULL) {
		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

		arc_cksum_verify(buf);
		arc_buf_unwatch(buf);

		if (ARC_BUF_SHARED(buf)) {
			/* The hdr owns the data; just stop sharing it. */
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			/* The buf owns its data; free it and fix kstats. */
			ASSERT(!arc_buf_is_shared(buf));
			uint64_t size = arc_buf_size(buf);
			arc_free_data_buf(hdr, buf->b_data, size, buf);
			ARCSTAT_INCR(arcstat_overhead_size, -size);
		}
		buf->b_data = NULL;

		/*
		 * If we have no more encrypted buffers and we've already
		 * gotten a copy of the decrypted data we can free b_rabd
		 * to save some space.
		 */
		if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
		    hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
			arc_buf_t *b;
			/* Scan for any other encrypted buf on the hdr. */
			for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
				if (b != buf && ARC_BUF_ENCRYPTED(b))
					break;
			}
			if (b == NULL)
				arc_hdr_free_abd(hdr, B_TRUE);
		}
	}

	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
		/*
		 * If the current arc_buf_t is sharing its data buffer with the
		 * hdr, then reassign the hdr's b_pabd to share it with the new
		 * buffer at the end of the list. The shared buffer is always
		 * the last one on the hdr's buffer list.
		 *
		 * There is an equivalent case for compressed bufs, but since
		 * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space be
		 * wasted temporarily. We must also be careful not to share
		 * encrypted buffers, since they cannot be shared.
		 */
		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
			/* Only one buf can be shared at once */
			ASSERT(!arc_buf_is_shared(lastbuf));
			/* hdr is uncompressed so can't have compressed buf */
			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));

			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			arc_hdr_free_abd(hdr, B_FALSE);

			/*
			 * We must setup a new shared block between the
			 * last buffer and the hdr. The data would have
			 * been allocated by the arc buf so we need to transfer
			 * ownership to the hdr since it's now being shared.
			 */
			arc_share_buf(hdr, lastbuf);
		}
	} else if (HDR_SHARED_DATA(hdr)) {
		/*
		 * Uncompressed shared buffers are always at the end
		 * of the list. Compressed buffers don't have the
		 * same requirements. This makes it hard to
		 * simply assert that the lastbuf is shared so
		 * we rely on the hdr's compression flags to determine
		 * if we have a compressed, shared buffer.
		 */
		ASSERT3P(lastbuf, !=, NULL);
		ASSERT(arc_buf_is_shared(lastbuf) ||
		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
	}

	/*
	 * Free the checksum if we're removing the last uncompressed buf from
	 * this hdr.
	 */
	if (!arc_hdr_has_uncompressed_buf(hdr)) {
		arc_cksum_free(hdr);
	}

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}
3220
/*
 * Allocate the hdr's backing ABD: the raw/encrypted one (b_rabd, sized
 * by psize) when ARC_HDR_ALLOC_RDATA is set in alloc_flags, otherwise
 * the physical one (b_pabd), and update the size kstats accordingly.
 */
static void
arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
{
	uint64_t size;
	boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);

	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
	/* Only protected (encrypted) hdrs may carry raw data. */
	IMPLY(alloc_rdata, HDR_PROTECTED(hdr));

	if (alloc_rdata) {
		size = HDR_GET_PSIZE(hdr);
		ASSERT0P(hdr->b_crypt_hdr.b_rabd);
		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
		    alloc_flags);
		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
		ARCSTAT_INCR(arcstat_raw_size, size);
	} else {
		size = arc_hdr_size(hdr);
		ASSERT0P(hdr->b_l1hdr.b_pabd);
		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
		    alloc_flags);
		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
	}

	ARCSTAT_INCR(arcstat_compressed_size, size);
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
3250
/*
 * Free the hdr's backing ABD: b_rabd if free_rdata, else b_pabd.
 * Mirrors arc_hdr_alloc_abd() for the kstat accounting.
 */
static void
arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
{
	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
	IMPLY(free_rdata, HDR_HAS_RABD(hdr));

	/*
	 * If the hdr is currently being written to the l2arc then
	 * we defer freeing the data by adding it to the l2arc_free_on_write
	 * list. The l2arc will free the data once it's finished
	 * writing it to the l2arc device.
	 */
	if (HDR_L2_WRITING(hdr)) {
		arc_hdr_free_on_write(hdr, free_rdata);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else if (free_rdata) {
		arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
	} else {
		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
	}

	if (free_rdata) {
		hdr->b_crypt_hdr.b_rabd = NULL;
		ARCSTAT_INCR(arcstat_raw_size, -size);
	} else {
		hdr->b_l1hdr.b_pabd = NULL;
	}

	/* With no data left at all, the byteswap function is meaningless. */
	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;

	ARCSTAT_INCR(arcstat_compressed_size, -size);
	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
}
3288
3289 /*
3290 * Allocate empty anonymous ARC header. The header will get its identity
3291 * assigned and buffers attached later as part of read or write operations.
3292 *
3293 * In case of read arc_read() assigns header its identify (b_dva + b_birth),
3294 * inserts it into ARC hash to become globally visible and allocates physical
3295 * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read
3296 * completion arc_read_done() allocates ARC buffer(s) as needed, potentially
3297 * sharing one of them with the physical ABD buffer.
3298 *
3299 * In case of write arc_alloc_buf() allocates ARC buffer to be filled with
3300 * data. Then after compression and/or encryption arc_write_ready() allocates
3301 * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
3302 * buffer. On disk write completion arc_write_done() assigns the header its
3303 * new identity (b_dva + b_birth) and inserts into ARC hash.
3304 *
3305 * In case of partial overwrite the old data is read first as described. Then
3306 * arc_release() either allocates new anonymous ARC header and moves the ARC
3307 * buffer to it, or reuses the old ARC header by discarding its identity and
3308 * removing it from ARC hash. After buffer modification normal write process
3309 * follows as described.
3310 */
3311 static arc_buf_hdr_t *
arc_hdr_alloc(uint64_t spa,int32_t psize,int32_t lsize,boolean_t protected,enum zio_compress compression_type,uint8_t complevel,arc_buf_contents_t type)3312 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3313 boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
3314 arc_buf_contents_t type)
3315 {
3316 arc_buf_hdr_t *hdr;
3317
3318 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3319 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3320
3321 ASSERT(HDR_EMPTY(hdr));
3322 #ifdef ZFS_DEBUG
3323 ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
3324 #endif
3325 HDR_SET_PSIZE(hdr, psize);
3326 HDR_SET_LSIZE(hdr, lsize);
3327 hdr->b_spa = spa;
3328 hdr->b_type = type;
3329 hdr->b_flags = 0;
3330 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3331 arc_hdr_set_compress(hdr, compression_type);
3332 hdr->b_complevel = complevel;
3333 if (protected)
3334 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3335
3336 hdr->b_l1hdr.b_state = arc_anon;
3337 hdr->b_l1hdr.b_arc_access = 0;
3338 hdr->b_l1hdr.b_mru_hits = 0;
3339 hdr->b_l1hdr.b_mru_ghost_hits = 0;
3340 hdr->b_l1hdr.b_mfu_hits = 0;
3341 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
3342 hdr->b_l1hdr.b_buf = NULL;
3343
3344 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3345
3346 return (hdr);
3347 }
3348
3349 /*
3350 * Transition between the two allocation states for the arc_buf_hdr struct.
3351 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3352 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3353 * version is used when a cache buffer is only in the L2ARC in order to reduce
3354 * memory usage.
3355 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	/* Only full <-> l2only conversions are supported. */
	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	/* Copy only the L2-only prefix; L1 fields are set up below. */
	memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT0P(nhdr->b_l1hdr.b_pabd);
		ASSERT(!HDR_HAS_RABD(hdr));
	} else {
		ASSERT0P(hdr->b_l1hdr.b_buf);
#ifdef ZFS_DEBUG
		ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
#endif

		/*
		 * If we've reached here, We must have been called from
		 * arc_evict_hdr(), as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
		 * might try to be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY0P(hdr->b_l1hdr.b_pabd);
		ASSERT(!HDR_HAS_RABD(hdr));

		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
	    arc_hdr_size(hdr), hdr);
	(void) zfs_refcount_add_many(&dev->l2ad_alloc,
	    arc_hdr_size(nhdr), nhdr);

	/* Drop the old header's identity before returning it to the cache. */
	buf_discard_identity(hdr);
	kmem_cache_free(old, hdr);

	return (nhdr);
}
3452
3453 /*
3454 * This function is used by the send / receive code to convert a newly
3455 * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
3456 * is also used to allow the root objset block to be updated without altering
3457 * its embedded MACs. Both block types will always be uncompressed so we do not
3458 * have to worry about compression type or psize.
3459 */
void
arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
    dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
    const uint8_t *mac)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/* Only dnode and objset blocks are ever converted in place. */
	ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);

	/* Raw bufs are flagged as both encrypted and compressed. */
	buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
	arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
	hdr->b_crypt_hdr.b_dsobj = dsobj;
	hdr->b_crypt_hdr.b_ot = ot;
	hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
	    DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
	/*
	 * NOTE(review): this check must follow the b_flags update above,
	 * since setting ARC_BUF_FLAG_COMPRESSED on this buf can change the
	 * result of arc_hdr_has_uncompressed_buf().
	 */
	if (!arc_hdr_has_uncompressed_buf(hdr))
		arc_cksum_free(hdr);

	/* Install any crypto parameters the caller supplied. */
	if (salt != NULL)
		memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
	if (iv != NULL)
		memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
	if (mac != NULL)
		memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
}
3487
3488 /*
3489 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3490 * The buf is returned thawed since we expect the consumer to modify it.
3491 */
3492 arc_buf_t *
arc_alloc_buf(spa_t * spa,const void * tag,arc_buf_contents_t type,int32_t size)3493 arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type,
3494 int32_t size)
3495 {
3496 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3497 B_FALSE, ZIO_COMPRESS_OFF, 0, type);
3498
3499 arc_buf_t *buf = NULL;
3500 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
3501 B_FALSE, B_FALSE, &buf));
3502 arc_buf_thaw(buf);
3503
3504 return (buf);
3505 }
3506
3507 /*
3508 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3509 * for bufs containing metadata.
3510 */
3511 arc_buf_t *
arc_alloc_compressed_buf(spa_t * spa,const void * tag,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)3512 arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize,
3513 uint64_t lsize, enum zio_compress compression_type, uint8_t complevel)
3514 {
3515 ASSERT3U(lsize, >, 0);
3516 ASSERT3U(lsize, >=, psize);
3517 ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
3518 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3519
3520 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3521 B_FALSE, compression_type, complevel, ARC_BUFC_DATA);
3522
3523 arc_buf_t *buf = NULL;
3524 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
3525 B_TRUE, B_FALSE, B_FALSE, &buf));
3526 arc_buf_thaw(buf);
3527
3528 /*
3529 * To ensure that the hdr has the correct data in it if we call
3530 * arc_untransform() on this buf before it's been written to disk,
3531 * it's easiest if we just set up sharing between the buf and the hdr.
3532 */
3533 arc_share_buf(hdr, buf);
3534
3535 return (buf);
3536 }
3537
3538 arc_buf_t *
arc_alloc_raw_buf(spa_t * spa,const void * tag,uint64_t dsobj,boolean_t byteorder,const uint8_t * salt,const uint8_t * iv,const uint8_t * mac,dmu_object_type_t ot,uint64_t psize,uint64_t lsize,enum zio_compress compression_type,uint8_t complevel)3539 arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj,
3540 boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
3541 const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3542 enum zio_compress compression_type, uint8_t complevel)
3543 {
3544 arc_buf_hdr_t *hdr;
3545 arc_buf_t *buf;
3546 arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
3547 ARC_BUFC_METADATA : ARC_BUFC_DATA;
3548
3549 ASSERT3U(lsize, >, 0);
3550 ASSERT3U(lsize, >=, psize);
3551 ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
3552 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3553
3554 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
3555 compression_type, complevel, type);
3556
3557 hdr->b_crypt_hdr.b_dsobj = dsobj;
3558 hdr->b_crypt_hdr.b_ot = ot;
3559 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3560 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3561 memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
3562 memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
3563 memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
3564
3565 /*
3566 * This buffer will be considered encrypted even if the ot is not an
3567 * encrypted type. It will become authenticated instead in
3568 * arc_write_ready().
3569 */
3570 buf = NULL;
3571 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
3572 B_FALSE, B_FALSE, &buf));
3573 arc_buf_thaw(buf);
3574
3575 return (buf);
3576 }
3577
3578 static void
l2arc_hdr_arcstats_update(arc_buf_hdr_t * hdr,boolean_t incr,boolean_t state_only)3579 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
3580 boolean_t state_only)
3581 {
3582 uint64_t lsize = HDR_GET_LSIZE(hdr);
3583 uint64_t psize = HDR_GET_PSIZE(hdr);
3584 uint64_t asize = HDR_GET_L2SIZE(hdr);
3585 arc_buf_contents_t type = hdr->b_type;
3586 int64_t lsize_s;
3587 int64_t psize_s;
3588 int64_t asize_s;
3589
3590 /* For L2 we expect the header's b_l2size to be valid */
3591 ASSERT3U(asize, >=, psize);
3592
3593 if (incr) {
3594 lsize_s = lsize;
3595 psize_s = psize;
3596 asize_s = asize;
3597 } else {
3598 lsize_s = -lsize;
3599 psize_s = -psize;
3600 asize_s = -asize;
3601 }
3602
3603 /* If the buffer is a prefetch, count it as such. */
3604 if (HDR_PREFETCH(hdr)) {
3605 ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
3606 } else {
3607 /*
3608 * We use the value stored in the L2 header upon initial
3609 * caching in L2ARC. This value will be updated in case
3610 * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
3611 * metadata (log entry) cannot currently be updated. Having
3612 * the ARC state in the L2 header solves the problem of a
3613 * possibly absent L1 header (apparent in buffers restored
3614 * from persistent L2ARC).
3615 */
3616 switch (hdr->b_l2hdr.b_arcs_state) {
3617 case ARC_STATE_MRU_GHOST:
3618 case ARC_STATE_MRU:
3619 ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
3620 break;
3621 case ARC_STATE_MFU_GHOST:
3622 case ARC_STATE_MFU:
3623 ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
3624 break;
3625 default:
3626 break;
3627 }
3628 }
3629
3630 if (state_only)
3631 return;
3632
3633 ARCSTAT_INCR(arcstat_l2_psize, psize_s);
3634 ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
3635
3636 switch (type) {
3637 case ARC_BUFC_DATA:
3638 ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
3639 break;
3640 case ARC_BUFC_METADATA:
3641 ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
3642 break;
3643 default:
3644 break;
3645 }
3646 }
3647
3648
/*
 * Detach a header from its L2ARC device: unlink it from the device's
 * buflist, roll back the L2 statistics and vdev space accounting, and
 * clear ARC_FLAG_HAS_L2HDR.  The caller must hold the device's l2ad_mtx.
 */
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
	l2arc_dev_t *dev = l2hdr->b_dev;

	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
	ASSERT(HDR_HAS_L2HDR(hdr));

	list_remove(&dev->l2ad_buflist, hdr);

	l2arc_hdr_arcstats_decrement(hdr);
	/*
	 * NOTE(review): l2ad_vdev can apparently be NULL here (presumably
	 * while the cache device is being removed); only update vdev space
	 * usage when it is still present.
	 */
	if (dev->l2ad_vdev != NULL) {
		uint64_t asize = HDR_GET_L2SIZE(hdr);
		vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
	}

	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
	    hdr);
	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
}
3670
/*
 * Free an arc_buf_hdr_t and everything it still owns: its identity, its
 * checksum, any remaining bufs and ABDs, and its L2 portion (if any).
 * The header must be anonymous with no remaining references and no I/O
 * in progress.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	}
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT(!HDR_IN_HASH_TABLE(hdr));
	/* Set once the L1 portion has been torn down under l2ad_mtx below. */
	boolean_t l1hdr_destroyed = B_FALSE;

	/*
	 * If L2_WRITING, destroy L1HDR before L2HDR (under mutex) so
	 * arc_hdr_free_abd() can properly defer ABDs. Otherwise, destroy
	 * L1HDR outside mutex to minimize contention.
	 */
	if (HDR_HAS_L2HDR(hdr)) {
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

		if (!buflist_held)
			mutex_enter(&dev->l2ad_mtx);

		/*
		 * Even though we checked this conditional above, we
		 * need to check this again now that we have the
		 * l2ad_mtx. This is because we could be racing with
		 * another thread calling l2arc_evict() which might have
		 * destroyed this header's L2 portion as we were waiting
		 * to acquire the l2ad_mtx. If that happens, we don't
		 * want to re-destroy the header's L2 portion.
		 */
		if (HDR_HAS_L2HDR(hdr)) {
			if (HDR_L2_WRITING(hdr)) {
				l1hdr_destroyed = B_TRUE;

				if (!HDR_EMPTY(hdr))
					buf_discard_identity(hdr);

				if (HDR_HAS_L1HDR(hdr)) {
					arc_cksum_free(hdr);

					while (hdr->b_l1hdr.b_buf != NULL)
						arc_buf_destroy_impl(
						    hdr->b_l1hdr.b_buf);

					if (hdr->b_l1hdr.b_pabd != NULL)
						arc_hdr_free_abd(hdr, B_FALSE);

					if (HDR_HAS_RABD(hdr))
						arc_hdr_free_abd(hdr, B_TRUE);
				}
			}

			arc_hdr_l2hdr_destroy(hdr);
		}

		if (!buflist_held)
			mutex_exit(&dev->l2ad_mtx);
	}

	/* L1 teardown outside the mutex, unless already done above. */
	if (!l1hdr_destroyed) {
		if (!HDR_EMPTY(hdr))
			buf_discard_identity(hdr);

		if (HDR_HAS_L1HDR(hdr)) {
			arc_cksum_free(hdr);

			while (hdr->b_l1hdr.b_buf != NULL)
				arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);

			if (hdr->b_l1hdr.b_pabd != NULL)
				arc_hdr_free_abd(hdr, B_FALSE);

			if (HDR_HAS_RABD(hdr))
				arc_hdr_free_abd(hdr, B_TRUE);
		}
	}

	/* Finally return the header itself to the appropriate cache. */
	ASSERT0P(hdr->b_hash_next);
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
		ASSERT0P(hdr->b_l1hdr.b_acb);
#ifdef ZFS_DEBUG
		ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
#endif
		kmem_cache_free(hdr_full_cache, hdr);
	} else {
		kmem_cache_free(hdr_l2only_cache, hdr);
	}
}
3762
/*
 * Destroy 'buf' and release the caller's reference (identified by 'tag')
 * on its header.  Anonymous headers are not in the hash table and need no
 * lock; otherwise the hash lock serializes us against concurrent lookups
 * and eviction.
 */
void
arc_buf_destroy(arc_buf_t *buf, const void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	if (hdr->b_l1hdr.b_state == arc_anon) {
		/* An anonymous hdr has exactly this one buf and one ref. */
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
		ASSERT(ARC_BUF_LAST(buf));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		VERIFY0(remove_reference(hdr, tag));
		return;
	}

	kmutex_t *hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);

	/* Re-verify identity now that we hold the hash lock. */
	ASSERT3P(hdr, ==, buf->b_hdr);
	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
	ASSERT3P(buf->b_data, !=, NULL);

	arc_buf_destroy_impl(buf);
	(void) remove_reference(hdr, tag);
	mutex_exit(hash_lock);
}
3789
/*
 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
 * state of the header is dependent on its state prior to entering this
 * function. The following transitions are possible:
 *
 *    - arc_mru -> arc_mru_ghost
 *    - arc_mfu -> arc_mfu_ghost
 *    - arc_mru_ghost -> arc_l2c_only
 *    - arc_mru_ghost -> deleted
 *    - arc_mfu_ghost -> arc_l2c_only
 *    - arc_mfu_ghost -> deleted
 *    - arc_uncached -> deleted
 *
 * Return total size of evicted data buffers for eviction progress tracking.
 * When evicting from ghost states return logical buffer size to make eviction
 * progress at the same (or at least comparable) rate as from non-ghost states.
 *
 * Return *real_evicted for actual ARC size reduction to wake up threads
 * waiting for it. For non-ghost states it includes size of evicted data
 * buffers (the headers are not freed there). For ghost states it includes
 * only the evicted headers size.
 */
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
{
	arc_state_t *evicted_state, *state;
	int64_t bytes_evicted = 0;

	/* Caller holds the hash lock; the hdr must be fully evictable. */
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT0P(hdr->b_l1hdr.b_buf);
	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));

	*real_evicted = 0;
	state = hdr->b_l1hdr.b_state;
	if (GHOST_STATE(state)) {

		/*
		 * l2arc_write_buffers() relies on a header's L1 portion
		 * (i.e. its b_pabd field) during it's write phase.
		 * Thus, we cannot push a header onto the arc_l2c_only
		 * state (removing its L1 piece) until the header is
		 * done being written to the l2arc.
		 */
		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
			ARCSTAT_BUMP(arcstat_evict_l2_skip);
			return (bytes_evicted);
		}

		ARCSTAT_BUMP(arcstat_deleted);
		bytes_evicted += HDR_GET_LSIZE(hdr);

		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

		if (HDR_HAS_L2HDR(hdr)) {
			/* Ghost headers hold no data. */
			ASSERT0P(hdr->b_l1hdr.b_pabd);
			ASSERT(!HDR_HAS_RABD(hdr));
			/*
			 * This buffer is cached on the 2nd Level ARC;
			 * don't destroy the header.
			 */
			arc_change_state(arc_l2c_only, hdr);
			/*
			 * dropping from L1+L2 cached to L2-only,
			 * realloc to remove the L1 header.
			 */
			(void) arc_hdr_realloc(hdr, hdr_full_cache,
			    hdr_l2only_cache);
			*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
		} else {
			arc_change_state(arc_anon, hdr);
			arc_hdr_destroy(hdr);
			*real_evicted += HDR_FULL_SIZE;
		}
		return (bytes_evicted);
	}

	ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
	evicted_state = (state == arc_uncached) ? arc_anon :
	    ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);

	/* prefetch buffers have a minimum lifespan */
	uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
	    arc_min_prescient_prefetch : arc_min_prefetch;
	if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime) {
		ARCSTAT_BUMP(arcstat_evict_skip);
		return (bytes_evicted);
	}

	/* Account for L2ARC eligibility of the data we are about to drop. */
	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
	} else {
		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
			ARCSTAT_INCR(arcstat_evict_l2_eligible,
			    HDR_GET_LSIZE(hdr));

			switch (state->arcs_state) {
				case ARC_STATE_MRU:
					ARCSTAT_INCR(
					    arcstat_evict_l2_eligible_mru,
					    HDR_GET_LSIZE(hdr));
					break;
				case ARC_STATE_MFU:
					ARCSTAT_INCR(
					    arcstat_evict_l2_eligible_mfu,
					    HDR_GET_LSIZE(hdr));
					break;
				default:
					break;
			}
		} else {
			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
			    HDR_GET_LSIZE(hdr));
		}
	}

	bytes_evicted += arc_hdr_size(hdr);
	*real_evicted += arc_hdr_size(hdr);

	/*
	 * If this hdr is being evicted and has a compressed buffer then we
	 * discard it here before we change states.  This ensures that the
	 * accounting is updated correctly in arc_free_data_impl().
	 */
	if (hdr->b_l1hdr.b_pabd != NULL)
		arc_hdr_free_abd(hdr, B_FALSE);

	if (HDR_HAS_RABD(hdr))
		arc_hdr_free_abd(hdr, B_TRUE);

	arc_change_state(evicted_state, hdr);
	DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
	if (evicted_state == arc_anon) {
		/* arc_uncached buffers are not kept as ghosts. */
		arc_hdr_destroy(hdr);
		*real_evicted += HDR_FULL_SIZE;
	} else {
		ASSERT(HDR_IN_HASH_TABLE(hdr));
	}

	return (bytes_evicted);
}
3933
3934 static void
arc_set_need_free(void)3935 arc_set_need_free(void)
3936 {
3937 ASSERT(MUTEX_HELD(&arc_evict_lock));
3938 int64_t remaining = arc_free_memory() - arc_sys_free / 2;
3939 arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
3940 if (aw == NULL) {
3941 arc_need_free = MAX(-remaining, 0);
3942 } else {
3943 arc_need_free =
3944 MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
3945 }
3946 }
3947
/*
 * Evict up to 'bytes' worth of headers from sublist 'idx' of 'ml',
 * walking backwards from 'marker' (and limited by
 * zfs_arc_evict_batch_limit headers per call).  If 'spa' is non-zero,
 * only headers belonging to that pool are evicted.  On return, '*more'
 * (if non-NULL) is set when the sublist still had candidates left, i.e.
 * another pass may be productive.  Returns the number of bytes evicted.
 */
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, uint64_t bytes, boolean_t *more)
{
	multilist_sublist_t *mls;
	uint64_t bytes_evicted = 0, real_evicted = 0;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	uint_t evict_count = zfs_arc_evict_batch_limit;

	ASSERT3P(marker, !=, NULL);

	mls = multilist_sublist_lock_idx(ml, idx);

	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
	    hdr = multilist_sublist_prev(mls, marker)) {
		if ((evict_count == 0) || (bytes_evicted >= bytes))
			break;

		/*
		 * To keep our iteration location, move the marker
		 * forward. Since we're not holding hdr's hash lock, we
		 * must be very careful and not remove 'hdr' from the
		 * sublist. Otherwise, other consumers might mistake the
		 * 'hdr' as not being on a sublist when they call the
		 * multilist_link_active() function (they all rely on
		 * the hash lock protecting concurrent insertions and
		 * removals). multilist_sublist_move_forward() was
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
		multilist_sublist_move_forward(mls, marker);

		/*
		 * The only case where the b_spa field should ever be
		 * zero, is the marker headers inserted by
		 * arc_evict_state(). It's possible for multiple threads
		 * to be calling arc_evict_state() concurrently (e.g.
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
		if (hdr->b_spa == 0)
			continue;

		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We aren't calling this function from any code path
		 * that would already be holding a hash lock, so we're
		 * asserting on this assumption to be defensive in case
		 * this ever changes. Without this check, it would be
		 * possible to incorrectly increment arcstat_mutex_miss
		 * below (e.g. if the code changed such that we called
		 * this function with a hash lock held).
		 */
		ASSERT(!MUTEX_HELD(hash_lock));

		if (mutex_tryenter(hash_lock)) {
			uint64_t revicted;
			uint64_t evicted = arc_evict_hdr(hdr, &revicted);
			mutex_exit(hash_lock);

			bytes_evicted += evicted;
			real_evicted += revicted;

			/*
			 * If evicted is zero, arc_evict_hdr() must have
			 * decided to skip this header, don't increment
			 * evict_count in this case.
			 */
			if (evicted != 0)
				evict_count--;

		} else {
			/* Lost the race for the hash lock; skip this hdr. */
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}

	multilist_sublist_unlock(mls);

	/* Indicate if another iteration may be productive. */
	if (more)
		*more = (hdr != NULL);

	/*
	 * Increment the count of evicted bytes, and wake up any threads that
	 * are waiting for the count to reach this value.  Since the list is
	 * ordered by ascending aew_count, we pop off the beginning of the
	 * list until we reach the end, or a waiter that's past the current
	 * "count".  Doing this outside the loop reduces the number of times
	 * we need to acquire the global arc_evict_lock.
	 *
	 * Only wake when there's sufficient free memory in the system
	 * (specifically, arc_sys_free/2, which by default is a bit more than
	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
	 */
	mutex_enter(&arc_evict_lock);
	arc_evict_count += real_evicted;

	if (arc_free_memory() > arc_sys_free / 2) {
		arc_evict_waiter_t *aw;
		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
		    aw->aew_count <= arc_evict_count) {
			list_remove(&arc_evict_waiters, aw);
			cv_signal(&aw->aew_cv);
		}
	}
	arc_set_need_free();
	mutex_exit(&arc_evict_lock);

	return (bytes_evicted);
}
4066
4067 static arc_buf_hdr_t *
arc_state_alloc_marker(void)4068 arc_state_alloc_marker(void)
4069 {
4070 arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4071
4072 /*
4073 * A b_spa of 0 is used to indicate that this header is
4074 * a marker. This fact is used in arc_evict_state_impl().
4075 */
4076 marker->b_spa = 0;
4077
4078 return (marker);
4079 }
4080
4081 static void
arc_state_free_marker(arc_buf_hdr_t * marker)4082 arc_state_free_marker(arc_buf_hdr_t *marker)
4083 {
4084 kmem_cache_free(hdr_full_cache, marker);
4085 }
4086
4087 /*
4088 * Allocate an array of buffer headers used as placeholders during arc state
4089 * eviction.
4090 */
4091 static arc_buf_hdr_t **
arc_state_alloc_markers(int count)4092 arc_state_alloc_markers(int count)
4093 {
4094 arc_buf_hdr_t **markers;
4095
4096 markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
4097 for (int i = 0; i < count; i++)
4098 markers[i] = arc_state_alloc_marker();
4099 return (markers);
4100 }
4101
4102 static void
arc_state_free_markers(arc_buf_hdr_t ** markers,int count)4103 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
4104 {
4105 for (int i = 0; i < count; i++)
4106 arc_state_free_marker(markers[i]);
4107 kmem_free(markers, sizeof (*markers) * count);
4108 }
4109
/*
 * Per-task argument for arc_evict_task(); one per eviction taskq worker
 * dispatched from arc_evict_state().
 */
typedef struct evict_arg {
	taskq_ent_t eva_tqent;		/* pre-initialized taskq entry */
	multilist_t *eva_ml;		/* list to evict from */
	arc_buf_hdr_t *eva_marker;	/* iteration marker for the sublist */
	int eva_idx;			/* sublist index */
	uint64_t eva_spa;		/* spa GUID to evict (0 == any) */
	uint64_t eva_bytes;		/* eviction target for this task */
	uint64_t eva_evicted;		/* out: bytes actually evicted */
} evict_arg_t;
4119
4120 static void
arc_evict_task(void * arg)4121 arc_evict_task(void *arg)
4122 {
4123 evict_arg_t *eva = arg;
4124 uint64_t total_evicted = 0;
4125 boolean_t more;
4126 uint_t batches = zfs_arc_evict_batches_limit;
4127
4128 /* Process multiple batches to amortize taskq dispatch overhead. */
4129 do {
4130 total_evicted += arc_evict_state_impl(eva->eva_ml,
4131 eva->eva_idx, eva->eva_marker, eva->eva_spa,
4132 eva->eva_bytes - total_evicted, &more);
4133 } while (total_evicted < eva->eva_bytes && --batches > 0 && more);
4134
4135 eva->eva_evicted = total_evicted;
4136 }
4137
4138 static void
arc_evict_thread_init(void)4139 arc_evict_thread_init(void)
4140 {
4141 if (zfs_arc_evict_threads == 0) {
4142 /*
4143 * Compute number of threads we want to use for eviction.
4144 *
4145 * Normally, it's log2(ncpus) + ncpus/32, which gets us to the
4146 * default max of 16 threads at ~256 CPUs.
4147 *
4148 * However, that formula goes to two threads at 4 CPUs, which
4149 * is still rather to low to be really useful, so we just go
4150 * with 1 thread at fewer than 6 cores.
4151 */
4152 if (max_ncpus < 6)
4153 zfs_arc_evict_threads = 1;
4154 else
4155 zfs_arc_evict_threads =
4156 (highbit64(max_ncpus) - 1) + max_ncpus / 32;
4157 } else if (zfs_arc_evict_threads > max_ncpus)
4158 zfs_arc_evict_threads = max_ncpus;
4159
4160 if (zfs_arc_evict_threads > 1) {
4161 arc_evict_taskq = taskq_create("arc_evict",
4162 zfs_arc_evict_threads, defclsyspri, 0, INT_MAX,
4163 TASKQ_PREPOPULATE);
4164 arc_evict_arg = kmem_zalloc(
4165 sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP);
4166 }
4167 }
4168
4169 /*
4170 * The minimum number of bytes we can evict at once is a block size.
4171 * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task.
4172 * We use this value to compute a scaling factor for the eviction tasks.
4173 */
4174 #define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE)
4175
/*
 * Evict buffers from the given arc state, until we've removed the
 * specified number of bytes. Move the removed buffers to the
 * appropriate evict state.
 *
 * This function makes a "best effort". It skips over any buffers
 * it can't get a hash_lock on, and so, may not catch all candidates.
 * It may also return without evicting as much space as requested.
 *
 * If bytes is specified using the special value ARC_EVICT_ALL, this
 * will evict all available (i.e. unlocked and evictable) buffers from
 * the given arc state; which is used by arc_flush().
 */
static uint64_t
arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
    uint64_t bytes)
{
	uint64_t total_evicted = 0;
	multilist_t *ml = &state->arcs_list[type];
	int num_sublists;
	arc_buf_hdr_t **markers;
	evict_arg_t *eva = NULL;

	num_sublists = multilist_get_num_sublists(ml);

	/* Whether eviction is spread across the arc_evict taskq workers. */
	boolean_t use_evcttq = zfs_arc_evict_threads > 1;

	/*
	 * If we've tried to evict from each sublist, made some
	 * progress, but still have not hit the target number of bytes
	 * to evict, we want to keep trying. The markers allow us to
	 * pick up where we left off for each individual sublist, rather
	 * than starting from the tail each time.
	 */
	if (zthr_iscurthread(arc_evict_zthr)) {
		/* The evict zthr reuses its preallocated marker set. */
		markers = arc_state_evict_markers;
		ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
	} else {
		markers = arc_state_alloc_markers(num_sublists);
	}
	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls;

		mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_insert_tail(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

	if (use_evcttq) {
		/* The evict zthr likewise reuses preallocated task args. */
		if (zthr_iscurthread(arc_evict_zthr))
			eva = arc_evict_arg;
		else
			eva = kmem_alloc(sizeof (evict_arg_t) *
			    zfs_arc_evict_threads, KM_NOSLEEP);
		if (eva) {
			for (int i = 0; i < zfs_arc_evict_threads; i++) {
				taskq_init_ent(&eva[i].eva_tqent);
				eva[i].eva_ml = ml;
				eva[i].eva_spa = spa;
			}
		} else {
			/*
			 * Fall back to the regular single evict if it is not
			 * possible to allocate memory for the taskq entries.
			 */
			use_evcttq = B_FALSE;
		}
	}

	/*
	 * Start eviction using a randomly selected sublist, this is to try and
	 * evenly balance eviction across all sublists. Always starting at the
	 * same sublist (e.g. index 0) would cause evictions to favor certain
	 * sublists over others.
	 */
	uint64_t scan_evicted = 0;
	int sublists_left = num_sublists;
	int sublist_idx = multilist_get_random_index(ml);

	/*
	 * While we haven't hit our target number of bytes to evict, or
	 * we're evicting all available buffers.
	 */
	while (total_evicted < bytes) {
		uint64_t evict = MIN_EVICT_SIZE;
		uint_t ntasks = zfs_arc_evict_threads;

		/*
		 * Decide how many tasks to use this pass, and how much each
		 * should evict; drop to single-threaded mode when the
		 * remaining work can't keep at least two tasks busy.
		 */
		if (use_evcttq) {
			if (sublists_left < ntasks)
				ntasks = sublists_left;

			if (ntasks < 2)
				use_evcttq = B_FALSE;
		}

		if (use_evcttq) {
			uint64_t left = bytes - total_evicted;

			if (bytes == ARC_EVICT_ALL) {
				evict = bytes;
			} else if (left > ntasks * MIN_EVICT_SIZE) {
				evict = DIV_ROUND_UP(left, ntasks);
			} else {
				ntasks = left / MIN_EVICT_SIZE;
				if (ntasks < 2)
					use_evcttq = B_FALSE;
				else
					evict = DIV_ROUND_UP(left, ntasks);
			}
		}

		for (int i = 0; sublists_left > 0; i++, sublist_idx++,
		    sublists_left--) {
			uint64_t bytes_evicted;

			/* we've reached the end, wrap to the beginning */
			if (sublist_idx >= num_sublists)
				sublist_idx = 0;

			if (use_evcttq) {
				if (i == ntasks)
					break;

				eva[i].eva_marker = markers[sublist_idx];
				eva[i].eva_idx = sublist_idx;
				eva[i].eva_bytes = evict;

				taskq_dispatch_ent(arc_evict_taskq,
				    arc_evict_task, &eva[i], 0,
				    &eva[i].eva_tqent);

				continue;
			}

			/* Single-threaded path: evict from this sublist. */
			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
			    markers[sublist_idx], spa, bytes - total_evicted,
			    NULL);

			scan_evicted += bytes_evicted;
			total_evicted += bytes_evicted;

			if (total_evicted < bytes)
				kpreempt(KPREEMPT_SYNC);
			else
				break;
		}

		/* Collect the results of the dispatched tasks. */
		if (use_evcttq) {
			taskq_wait(arc_evict_taskq);

			for (int i = 0; i < ntasks; i++) {
				scan_evicted += eva[i].eva_evicted;
				total_evicted += eva[i].eva_evicted;
			}
		}

		/*
		 * If we scanned all sublists and didn't evict anything, we
		 * have no reason to believe we'll evict more during another
		 * scan, so break the loop.
		 */
		if (scan_evicted == 0 && sublists_left == 0) {
			/* This isn't possible, let's make that obvious */
			ASSERT3S(bytes, !=, 0);

			/*
			 * When bytes is ARC_EVICT_ALL, the only way to
			 * break the loop is when scan_evicted is zero.
			 * In that case, we actually have evicted enough,
			 * so we don't want to increment the kstat.
			 */
			if (bytes != ARC_EVICT_ALL) {
				ASSERT3S(total_evicted, <, bytes);
				ARCSTAT_BUMP(arcstat_evict_not_enough);
			}

			break;
		}

		/*
		 * If we scanned all sublists but still have more to do,
		 * reset the counts so we can go around again.
		 */
		if (sublists_left == 0) {
			sublists_left = num_sublists;
			sublist_idx = multilist_get_random_index(ml);
			scan_evicted = 0;

			/*
			 * Since we're about to reconsider all sublists,
			 * re-enable use of the evict threads if available.
			 */
			use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL);
		}
	}

	/* Free the task args unless we borrowed the preallocated set. */
	if (eva != NULL && eva != arc_evict_arg)
		kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads);

	for (int i = 0; i < num_sublists; i++) {
		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
		multilist_sublist_remove(mls, markers[i]);
		multilist_sublist_unlock(mls);
	}

	if (markers != arc_state_evict_markers)
		arc_state_free_markers(markers, num_sublists);

	return (total_evicted);
}
4386
4387 /*
4388 * Flush all "evictable" data of the given type from the arc state
4389 * specified. This will not evict any "active" buffers (i.e. referenced).
4390 *
4391 * When 'retry' is set to B_FALSE, the function will make a single pass
4392 * over the state and evict any buffers that it can. Since it doesn't
4393 * continually retry the eviction, it might end up leaving some buffers
4394 * in the ARC due to lock misses.
4395 *
4396 * When 'retry' is set to B_TRUE, the function will continually retry the
4397 * eviction until *all* evictable buffers have been removed from the
4398 * state. As a result, if concurrent insertions into the state are
4399 * allowed (e.g. if the ARC isn't shutting down), this function might
4400 * wind up in an infinite loop, continually trying to evict buffers.
4401 */
4402 static uint64_t
arc_flush_state(arc_state_t * state,uint64_t spa,arc_buf_contents_t type,boolean_t retry)4403 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4404 boolean_t retry)
4405 {
4406 uint64_t evicted = 0;
4407
4408 while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
4409 evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
4410
4411 if (!retry)
4412 break;
4413 }
4414
4415 return (evicted);
4416 }
4417
4418 /*
4419 * Evict the specified number of bytes from the state specified. This
4420 * function prevents us from trying to evict more from a state's list
4421 * than is "evictable", and to skip evicting altogether when passed a
4422 * negative value for "bytes". In contrast, arc_evict_state() will
4423 * evict everything it can, when passed a negative value for "bytes".
4424 */
4425 static uint64_t
arc_evict_impl(arc_state_t * state,arc_buf_contents_t type,int64_t bytes)4426 arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
4427 {
4428 uint64_t delta;
4429
4430 if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
4431 delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
4432 bytes);
4433 return (arc_evict_state(state, type, 0, delta));
4434 }
4435
4436 return (0);
4437 }
4438
/*
 * Adjust specified fraction, taking into account initial ghost state(s) size,
 * ghost hit bytes towards increasing the fraction, ghost hit bytes towards
 * decreasing it, plus a balance factor, controlling the decrease rate, used
 * to balance metadata vs data.
 *
 * 'frac' is a 32-bit fixed-point fraction in [0, 1ULL << 32] (see the
 * ASSERT below); the return value is the adjusted fraction in the same
 * representation.
 */
static uint64_t
arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
    uint_t balance)
{
	/* Too little ghost data (or no hits) to draw conclusions from. */
	if (total < 32 || up + down == 0)
		return (frac);

	/*
	 * We should not have more ghost hits than ghost size, but they may
	 * get close. To avoid overflows below up/down should not be bigger
	 * than 1/5 of total. But to limit maximum adjustment speed restrict
	 * it some more.
	 */
	if (up + down >= total / 16) {
		uint64_t scale = (up + down) / (total / 32);
		up /= scale;
		down /= scale;
	}

	/* Get maximal dynamic range by choosing optimal shifts. */
	int s = highbit64(total);
	s = MIN(64 - s, 32);

	ASSERT3U(frac, <=, 1ULL << 32);
	/* ofrac is the complementary fraction: 1.0 - frac. */
	uint64_t ofrac = (1ULL << 32) - frac;

	/* Dampen adjustments pushing an already-dominant side further. */
	if (frac >= 4 * ofrac)
		up /= frac / (2 * ofrac + 1);
	up = (up << s) / (total >> (32 - s));
	if (ofrac >= 4 * frac)
		down /= ofrac / (2 * frac + 1);
	down = (down << s) / (total >> (32 - s));
	/* 'balance' scales only the decrease rate (100 == neutral). */
	down = down * 100 / balance;

	ASSERT3U(up, <=, (1ULL << 32) - frac);
	ASSERT3U(down, <=, frac);
	return (frac + up - down);
}
4483
/*
 * Calculate (x * multiplier / divisor) without unnecessary overflows:
 * split x into its quotient and remainder with respect to the divisor so
 * that the intermediate products stay small.
 */
static uint64_t
arc_mf(uint64_t x, uint64_t multiplier, uint64_t divisor)
{
	uint64_t whole = (x / divisor) * multiplier;
	uint64_t part = ((x % divisor) * multiplier) / divisor;

	return (whole + part);
}
4495
/*
 * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
 *
 * Returns the total number of bytes evicted from the regular (non-ghost)
 * states.  Also re-balances the metadata-vs-data and MRU-vs-MFU target
 * fractions based on ghost-state hits observed since the previous call,
 * and trims the ghost states to their new target sizes.
 */
static uint64_t
arc_evict(void)
{
	uint64_t bytes, total_evicted = 0;
	int64_t e, mrud, mrum, mfud, mfum, w;
	/* Ghost hit counter values as observed by the previous call. */
	static uint64_t ogrd, ogrm, ogfd, ogfm;
	/* Ghost state target sizes computed at the end of the previous call. */
	static uint64_t gsrd, gsrm, gsfd, gsfm;
	uint64_t ngrd, ngrm, ngfd, ngfm;

	/* Get current size of ARC states we can evict from. */
	mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) +
	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]);
	mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
	mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
	mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
	uint64_t d = mrud + mfud;	/* Total data. */
	uint64_t m = mrum + mfum;	/* Total metadata. */
	uint64_t t = d + m;		/* Total of all evictable states. */

	/* Get ARC ghost hits since last eviction. */
	ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
	uint64_t grd = ngrd - ogrd;
	ogrd = ngrd;
	ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
	uint64_t grm = ngrm - ogrm;
	ogrm = ngrm;
	ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
	uint64_t gfd = ngfd - ogfd;
	ogfd = ngfd;
	ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
	uint64_t gfm = ngfm - ogfm;
	ogfm = ngfm;

	/* Adjust ARC states balance based on ghost hits. */
	arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm,
	    grm + gfm, grd + gfd, zfs_arc_meta_balance);
	arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100);
	arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100);

	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);
	uint64_t ac = arc_c;
	/* Evictable bytes that should remain once size is back under target. */
	int64_t wt = t - (asize - ac);

	/*
	 * Try to reduce pinned dnodes if more than 3/4 of wanted metadata
	 * target is not evictable or if they go over arc_dnode_limit.
	 */
	int64_t prune = 0;
	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
	/* Non-evictable (pinned) metadata: total minus evictable size. */
	int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
	    + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
	    - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
	    - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
	/* Wanted metadata target (arc_meta is a 32-bit fixed-point fraction). */
	w = wt * (int64_t)(arc_meta >> 16) >> 16;
	if (nem > w * 3 / 4) {
		prune = dn / sizeof (dnode_t) *
		    zfs_arc_dnode_reduce_percent / 100;
		/* Scale the prune count down while still below the target. */
		if (nem < w && w > 4)
			prune = arc_mf(prune, nem - w * 3 / 4, w / 4);
	}
	if (dn > arc_dnode_limit) {
		prune = MAX(prune, (dn - arc_dnode_limit) / sizeof (dnode_t) *
		    zfs_arc_dnode_reduce_percent / 100);
	}
	if (prune > 0)
		arc_prune_async(prune);

	/* Evict MRU metadata. */
	w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16;
	e = MIN((int64_t)(asize - ac), (int64_t)(mrum - w));
	bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e);
	total_evicted += bytes;
	mrum -= bytes;
	asize -= bytes;

	/* Evict MFU metadata. */
	w = wt * (int64_t)(arc_meta >> 16) >> 16;
	e = MIN((int64_t)(asize - ac), (int64_t)(m - bytes - w));
	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e);
	total_evicted += bytes;
	mfum -= bytes;
	asize -= bytes;

	/* Evict MRU data. */
	wt -= m - total_evicted;
	w = wt * (int64_t)(arc_pd >> 16) >> 16;
	e = MIN((int64_t)(asize - ac), (int64_t)(mrud - w));
	bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e);
	total_evicted += bytes;
	mrud -= bytes;
	asize -= bytes;

	/* Evict MFU data.  Whatever overage remains comes out of MFU data. */
	e = asize - ac;
	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e);
	mfud -= bytes;
	total_evicted += bytes;

	/*
	 * Evict ghost lists
	 *
	 * Size of each state's ghost list represents how much that state
	 * may grow by shrinking the other states.  Would it need to shrink
	 * other states to zero (that is unlikely), its ghost size would be
	 * equal to sum of other three state sizes.  But excessive ghost
	 * size may result in false ghost hits (too far back), that may
	 * never result in real cache hits if several states are competing.
	 * So choose some arbitrary point of 1/2 of other state sizes.
	 */
	gsrd = (mrum + mfud + mfum) / 2;
	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) -
	    gsrd;
	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e);

	gsrm = (mrud + mfud + mfum) / 2;
	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) -
	    gsrm;
	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e);

	gsfd = (mrud + mrum + mfum) / 2;
	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) -
	    gsfd;
	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e);

	gsfm = (mrud + mrum + mfud) / 2;
	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) -
	    gsfm;
	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e);

	return (total_evicted);
}
4631
4632 static void
arc_flush_impl(uint64_t guid,boolean_t retry)4633 arc_flush_impl(uint64_t guid, boolean_t retry)
4634 {
4635 ASSERT(!retry || guid == 0);
4636
4637 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4638 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4639
4640 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4641 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4642
4643 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4644 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4645
4646 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4647 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4648
4649 (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry);
4650 (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry);
4651 }
4652
4653 void
arc_flush(spa_t * spa,boolean_t retry)4654 arc_flush(spa_t *spa, boolean_t retry)
4655 {
4656 /*
4657 * If retry is B_TRUE, a spa must not be specified since we have
4658 * no good way to determine if all of a spa's buffers have been
4659 * evicted from an arc state.
4660 */
4661 ASSERT(!retry || spa == NULL);
4662
4663 arc_flush_impl(spa != NULL ? spa_load_guid(spa) : 0, retry);
4664 }
4665
4666 static arc_async_flush_t *
arc_async_flush_add(uint64_t spa_guid,uint_t level)4667 arc_async_flush_add(uint64_t spa_guid, uint_t level)
4668 {
4669 arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP);
4670 af->af_spa_guid = spa_guid;
4671 af->af_cache_level = level;
4672 taskq_init_ent(&af->af_tqent);
4673 list_link_init(&af->af_node);
4674
4675 mutex_enter(&arc_async_flush_lock);
4676 list_insert_tail(&arc_async_flush_list, af);
4677 mutex_exit(&arc_async_flush_lock);
4678
4679 return (af);
4680 }
4681
4682 static void
arc_async_flush_remove(uint64_t spa_guid,uint_t level)4683 arc_async_flush_remove(uint64_t spa_guid, uint_t level)
4684 {
4685 mutex_enter(&arc_async_flush_lock);
4686 for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
4687 af != NULL; af = list_next(&arc_async_flush_list, af)) {
4688 if (af->af_spa_guid == spa_guid &&
4689 af->af_cache_level == level) {
4690 list_remove(&arc_async_flush_list, af);
4691 kmem_free(af, sizeof (*af));
4692 break;
4693 }
4694 }
4695 mutex_exit(&arc_async_flush_lock);
4696 }
4697
4698 static void
arc_flush_task(void * arg)4699 arc_flush_task(void *arg)
4700 {
4701 arc_async_flush_t *af = arg;
4702 hrtime_t start_time = gethrtime();
4703 uint64_t spa_guid = af->af_spa_guid;
4704
4705 arc_flush_impl(spa_guid, B_FALSE);
4706 arc_async_flush_remove(spa_guid, af->af_cache_level);
4707
4708 uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
4709 if (elapsed > 0) {
4710 zfs_dbgmsg("spa %llu arc flushed in %llu ms",
4711 (u_longlong_t)spa_guid, (u_longlong_t)elapsed);
4712 }
4713 }
4714
4715 /*
4716 * ARC buffers use the spa's load guid and can continue to exist after
4717 * the spa_t is gone (exported). The blocks are orphaned since each
4718 * spa import has a different load guid.
4719 *
4720 * It's OK if the spa is re-imported while this asynchronous flush is
4721 * still in progress. The new spa_load_guid will be different.
4722 *
4723 * Also, arc_fini will wait for any arc_flush_task to finish.
4724 */
4725 void
arc_flush_async(spa_t * spa)4726 arc_flush_async(spa_t *spa)
4727 {
4728 uint64_t spa_guid = spa_load_guid(spa);
4729 arc_async_flush_t *af = arc_async_flush_add(spa_guid, 1);
4730
4731 taskq_dispatch_ent(arc_flush_taskq, arc_flush_task,
4732 af, TQ_SLEEP, &af->af_tqent);
4733 }
4734
4735 /*
4736 * Check if a guid is still in-use as part of an async teardown task
4737 */
4738 boolean_t
arc_async_flush_guid_inuse(uint64_t spa_guid)4739 arc_async_flush_guid_inuse(uint64_t spa_guid)
4740 {
4741 mutex_enter(&arc_async_flush_lock);
4742 for (arc_async_flush_t *af = list_head(&arc_async_flush_list);
4743 af != NULL; af = list_next(&arc_async_flush_list, af)) {
4744 if (af->af_spa_guid == spa_guid) {
4745 mutex_exit(&arc_async_flush_lock);
4746 return (B_TRUE);
4747 }
4748 }
4749 mutex_exit(&arc_async_flush_lock);
4750 return (B_FALSE);
4751 }
4752
/*
 * Reduce the ARC target size (arc_c) by up to to_free bytes, never going
 * below arc_c_min, and request eviction if the current ARC size exceeds
 * the (possibly reduced) target.  Returns the number of bytes the target
 * was actually reduced by.
 */
uint64_t
arc_reduce_target_size(uint64_t to_free)
{
	/*
	 * Get the actual arc size.  Even if we don't need it, this updates
	 * the aggsum lower bound estimate for arc_is_overflowing().
	 */
	uint64_t asize = aggsum_value(&arc_sums.arcstat_size);

	/*
	 * All callers want the ARC to actually evict (at least) this much
	 * memory.  Therefore we reduce from the lower of the current size and
	 * the target size.  This way, even if arc_c is much higher than
	 * arc_size (as can be the case after many calls to arc_freed(), we will
	 * immediately have arc_c < arc_size and therefore the arc_evict_zthr
	 * will evict.
	 */
	uint64_t c = arc_c;
	if (c > arc_c_min) {
		c = MIN(c, MAX(asize, arc_c_min));
		to_free = MIN(to_free, c - arc_c_min);
		arc_c = c - to_free;
	} else {
		/* Already at the floor; the target cannot shrink further. */
		to_free = 0;
	}

	/*
	 * Since dbuf cache size is a fraction of target ARC size, we should
	 * notify dbuf about the reduction, which might be significant,
	 * especially if current ARC size was much smaller than the target.
	 */
	dbuf_cache_reduce_target_size();

	/*
	 * Whether or not we reduced the target size, request eviction if the
	 * current size is over it now, since caller obviously wants some RAM.
	 */
	if (asize > arc_c) {
		/* See comment in arc_evict_cb_check() on why lock+flag */
		mutex_enter(&arc_evict_lock);
		arc_evict_needed = B_TRUE;
		mutex_exit(&arc_evict_lock);
		zthr_wakeup(arc_evict_zthr);
	}

	return (to_free);
}
4800
4801 /*
4802 * Determine if the system is under memory pressure and is asking
4803 * to reclaim memory. A return value of B_TRUE indicates that the system
4804 * is under memory pressure and that the arc should adjust accordingly.
4805 */
4806 boolean_t
arc_reclaim_needed(void)4807 arc_reclaim_needed(void)
4808 {
4809 return (arc_available_memory() < 0);
4810 }
4811
4812 void
arc_kmem_reap_soon(void)4813 arc_kmem_reap_soon(void)
4814 {
4815 size_t i;
4816 kmem_cache_t *prev_cache = NULL;
4817 kmem_cache_t *prev_data_cache = NULL;
4818
4819 #ifdef _KERNEL
4820 #if defined(_ILP32)
4821 /*
4822 * Reclaim unused memory from all kmem caches.
4823 */
4824 kmem_reap();
4825 #endif
4826 #endif
4827
4828 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4829 #if defined(_ILP32)
4830 /* reach upper limit of cache size on 32-bit */
4831 if (zio_buf_cache[i] == NULL)
4832 break;
4833 #endif
4834 if (zio_buf_cache[i] != prev_cache) {
4835 prev_cache = zio_buf_cache[i];
4836 kmem_cache_reap_now(zio_buf_cache[i]);
4837 }
4838 if (zio_data_buf_cache[i] != prev_data_cache) {
4839 prev_data_cache = zio_data_buf_cache[i];
4840 kmem_cache_reap_now(zio_data_buf_cache[i]);
4841 }
4842 }
4843 kmem_cache_reap_now(buf_cache);
4844 kmem_cache_reap_now(hdr_full_cache);
4845 kmem_cache_reap_now(hdr_l2only_cache);
4846 kmem_cache_reap_now(zfs_btree_leaf_cache);
4847 abd_cache_reap_now();
4848 }
4849
/*
 * zthr predicate callback: returns B_TRUE when the eviction thread
 * (arc_evict_cb) has work to do — either eviction was explicitly
 * requested via arc_evict_needed, or the uncached state holds evictable
 * buffers that have not been flushed recently.
 */
static boolean_t
arc_evict_cb_check(void *arg, zthr_t *zthr)
{
	(void) arg, (void) zthr;

#ifdef ZFS_DEBUG
	/*
	 * This is necessary in order to keep the kstat information
	 * up to date for tools that display kstat data such as the
	 * mdb ::arc dcmd and the Linux crash utility.  These tools
	 * typically do not call kstat's update function, but simply
	 * dump out stats from the most recent update.  Without
	 * this call, these commands may show stale stats for the
	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists.  Even
	 * with this call, the data might be out of date if the
	 * evict thread hasn't been woken recently; but that should
	 * suffice.  The arc_state_t structures can be queried
	 * directly if more accurate information is needed.
	 */
	if (arc_ksp != NULL)
		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
#endif

	/*
	 * We have to rely on arc_wait_for_eviction() to tell us when to
	 * evict, rather than checking if we are overflowing here, so that we
	 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
	 * If we have become "not overflowing" since arc_wait_for_eviction()
	 * checked, we need to wake it up.  We could broadcast the CV here,
	 * but arc_wait_for_eviction() may have not yet gone to sleep.  We
	 * would need to use a mutex to ensure that this function doesn't
	 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
	 * the arc_evict_lock).  However, the lock ordering of such a lock
	 * would necessarily be incorrect with respect to the zthr_lock,
	 * which is held before this function is called, and is held by
	 * arc_wait_for_eviction() when it calls zthr_wakeup().
	 */
	if (arc_evict_needed)
		return (B_TRUE);

	/*
	 * If we have buffers in uncached state, evict them periodically.
	 * (`+' binds tighter than `&&', so this reads: the uncached
	 * evictable size is non-zero AND enough time has passed since the
	 * last uncached flush.)
	 */
	return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
	    zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
	    ddi_get_lbolt() - arc_last_uncached_flush > arc_min_prefetch / 2));
}
4897
/*
 * Keep arc_size under arc_c by running arc_evict which evicts data
 * from the ARC.  This is the arc_evict_zthr work callback; it also
 * clears arc_evict_needed and wakes any arc_wait_for_eviction() waiters
 * once no further eviction is possible or needed.
 */
static void
arc_evict_cb(void *arg, zthr_t *zthr)
{
	(void) arg;

	uint64_t evicted = 0;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	/* Always try to evict from uncached state. */
	arc_last_uncached_flush = ddi_get_lbolt();
	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);

	/* Evict from other states only if told to. */
	if (arc_evict_needed)
		evicted += arc_evict();

	/*
	 * If evicted is zero, we couldn't evict anything
	 * via arc_evict().  This could be due to hash lock
	 * collisions, but more likely due to the majority of
	 * arc buffers being unevictable.  Therefore, even if
	 * arc_size is above arc_c, another pass is unlikely to
	 * be helpful and could potentially cause us to enter an
	 * infinite loop.  Additionally, zthr_iscancelled() is
	 * checked here so that if the arc is shutting down, the
	 * broadcast will wake any remaining arc evict waiters.
	 *
	 * Note we cancel using zthr instead of arc_evict_zthr
	 * because the latter may not yet be initialized when the
	 * callback is first invoked.
	 */
	mutex_enter(&arc_evict_lock);
	arc_evict_needed = !zthr_iscancelled(zthr) &&
	    evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
	if (!arc_evict_needed) {
		/*
		 * We're either no longer overflowing, or we
		 * can't evict anything more, so we should wake
		 * arc_get_data_impl() sooner.
		 */
		arc_evict_waiter_t *aw;
		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
			cv_signal(&aw->aew_cv);
		}
		arc_set_need_free();
	}
	mutex_exit(&arc_evict_lock);
	spl_fstrans_unmark(cookie);
}
4952
/*
 * zthr predicate callback for the reap thread: returns B_TRUE (run
 * arc_reap_cb) when free memory is negative and no kmem reap is already
 * in progress.  As a side effect it maintains the arc_no_grow /
 * arc_growtime state and periodically reaps the zstd context cache.
 */
static boolean_t
arc_reap_cb_check(void *arg, zthr_t *zthr)
{
	(void) arg, (void) zthr;

	int64_t free_memory = arc_available_memory();
	static int reap_cb_check_counter = 0;

	/*
	 * If a kmem reap is already active, don't schedule more.  We must
	 * check for this because kmem_cache_reap_soon() won't actually
	 * block on the cache being reaped (this is to prevent callers from
	 * becoming implicitly blocked by a system-wide kmem reap -- which,
	 * on a system with many, many full magazines, can take minutes).
	 */
	if (!kmem_cache_reap_active() && free_memory < 0) {

		arc_no_grow = B_TRUE;
		arc_warm = B_TRUE;
		/*
		 * Wait at least zfs_grow_retry (default 5) seconds
		 * before considering growing.
		 */
		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
		return (B_TRUE);
	} else if (free_memory < arc_c >> arc_no_grow_shift) {
		/* Free memory is low but not negative: just stop growing. */
		arc_no_grow = B_TRUE;
	} else if (gethrtime() >= arc_growtime) {
		/* Enough time has passed since pressure; allow growth again. */
		arc_no_grow = B_FALSE;
	}

	/*
	 * Called unconditionally every 60 seconds to reclaim unused
	 * zstd compression and decompression context.  This is done
	 * here to avoid the need for an independent thread.
	 */
	if (!((reap_cb_check_counter++) % 60))
		zfs_zstd_cache_reap_now();

	return (B_FALSE);
}
4994
/*
 * Keep enough free memory in the system by reaping the ARC's kmem
 * caches.  To cause more slabs to be reapable, we may reduce the
 * target size of the cache (arc_c), causing the arc_evict_cb()
 * to free more buffers.
 */
static void
arc_reap_cb(void *arg, zthr_t *zthr)
{
	int64_t can_free, free_memory, to_free;

	(void) arg, (void) zthr;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	/*
	 * Kick off asynchronous kmem_reap()'s of all our caches.
	 */
	arc_kmem_reap_soon();

	/*
	 * Wait at least arc_kmem_cache_reap_retry_ms between
	 * arc_kmem_reap_soon() calls.  Without this check it is possible to
	 * end up in a situation where we spend lots of time reaping
	 * caches, while we're near arc_c_min.  Waiting here also gives the
	 * subsequent free memory check a chance of finding that the
	 * asynchronous reap has already freed enough memory, and we don't
	 * need to call arc_reduce_target_size().
	 */
	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);

	/*
	 * Reduce the target size as needed to maintain the amount of free
	 * memory in the system at a fraction of the arc_size (1/128th by
	 * default).  If oversubscribed (free_memory < 0) then reduce the
	 * target arc_size by the deficit amount plus the fractional
	 * amount.  If free memory is positive but less than the fractional
	 * amount, reduce by what is needed to hit the fractional amount.
	 */
	free_memory = arc_available_memory();
	/* Room remaining between the current target and the floor. */
	can_free = arc_c - arc_c_min;
	to_free = (MAX(can_free, 0) >> arc_shrink_shift) - free_memory;
	if (to_free > 0)
		arc_reduce_target_size(to_free);
	spl_fstrans_unmark(cookie);
}
5040
5041 #ifdef _KERNEL
5042 /*
5043 * Determine the amount of memory eligible for eviction contained in the
5044 * ARC. All clean data reported by the ghost lists can always be safely
5045 * evicted. Due to arc_c_min, the same does not hold for all clean data
5046 * contained by the regular mru and mfu lists.
5047 *
5048 * In the case of the regular mru and mfu lists, we need to report as
5049 * much clean data as possible, such that evicting that same reported
5050 * data will not bring arc_size below arc_c_min. Thus, in certain
5051 * circumstances, the total amount of clean data in the mru and mfu
5052 * lists might not actually be evictable.
5053 *
5054 * The following two distinct cases are accounted for:
5055 *
5056 * 1. The sum of the amount of dirty data contained by both the mru and
5057 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
5058 * is greater than or equal to arc_c_min.
5059 * (i.e. amount of dirty data >= arc_c_min)
5060 *
5061 * This is the easy case; all clean data contained by the mru and mfu
5062 * lists is evictable. Evicting all clean data can only drop arc_size
5063 * to the amount of dirty data, which is greater than arc_c_min.
5064 *
5065 * 2. The sum of the amount of dirty data contained by both the mru and
5066 * mfu lists, plus the ARC's other accounting (e.g. the anon list),
5067 * is less than arc_c_min.
5068 * (i.e. arc_c_min > amount of dirty data)
5069 *
 * 2.1. arc_size is greater than or equal to arc_c_min.
5071 * (i.e. arc_size >= arc_c_min > amount of dirty data)
5072 *
5073 * In this case, not all clean data from the regular mru and mfu
5074 * lists is actually evictable; we must leave enough clean data
5075 * to keep arc_size above arc_c_min. Thus, the maximum amount of
5076 * evictable data from the two lists combined, is exactly the
5077 * difference between arc_size and arc_c_min.
5078 *
5079 * 2.2. arc_size is less than arc_c_min
5080 * (i.e. arc_c_min > arc_size > amount of dirty data)
5081 *
5082 * In this case, none of the data contained in the mru and mfu
5083 * lists is evictable, even if it's clean. Since arc_size is
5084 * already below arc_c_min, evicting any more would only
5085 * increase this negative difference.
5086 */
5087
5088 #endif /* _KERNEL */
5089
/*
 * Adapt the target ARC size (arc_c) given the number of bytes we are
 * trying to add.  This function is only called when we are adding new
 * content to the cache.  If the system is short on memory the reap
 * thread is woken instead; otherwise, if growth is allowed and the
 * current size is close to the target, the target is grown (capped at
 * arc_c_max).
 */
static void
arc_adapt(uint64_t bytes)
{
	/*
	 * Wake reap thread if we do not have any available memory
	 */
	if (arc_reclaim_needed()) {
		zthr_wakeup(arc_reap_zthr);
		return;
	}

	/* Growth suppressed by recent memory pressure (arc_reap_cb_check). */
	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (aggsum_upper_bound(&arc_sums.arcstat_size) +
	    2 * SPA_MAXBLOCKSIZE >= arc_c) {
		/* Grow by at least one old-style max block. */
		uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
		if (atomic_add_64_nv(&arc_c, dc) > arc_c_max)
			arc_c = arc_c_max;
	}
}
5123
5124 /*
5125 * Check if ARC current size has grown past our upper thresholds.
5126 */
5127 static arc_ovf_level_t
arc_is_overflowing(boolean_t lax,boolean_t use_reserve)5128 arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
5129 {
5130 /*
5131 * We just compare the lower bound here for performance reasons. Our
5132 * primary goals are to make sure that the arc never grows without
5133 * bound, and that it can reach its maximum size. This check
5134 * accomplishes both goals. The maximum amount we could run over by is
5135 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
5136 * in the ARC. In practice, that's in the tens of MB, which is low
5137 * enough to be safe.
5138 */
5139 int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
5140 zfs_max_recordsize;
5141 int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
5142 arc_dnode_limit;
5143
5144 /* Always allow at least one block of overflow. */
5145 if (arc_over < 0 && dn_over <= 0)
5146 return (ARC_OVF_NONE);
5147
5148 /* If we are under memory pressure, report severe overflow. */
5149 if (!lax)
5150 return (ARC_OVF_SEVERE);
5151
5152 /* We are not under pressure, so be more or less relaxed. */
5153 int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
5154 if (use_reserve)
5155 overflow *= 3;
5156 return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
5157 }
5158
5159 static abd_t *
arc_get_data_abd(arc_buf_hdr_t * hdr,uint64_t size,const void * tag,int alloc_flags)5160 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
5161 int alloc_flags)
5162 {
5163 arc_buf_contents_t type = arc_buf_type(hdr);
5164
5165 arc_get_data_impl(hdr, size, tag, alloc_flags);
5166 if (alloc_flags & ARC_HDR_ALLOC_LINEAR)
5167 return (abd_alloc_linear(size, type == ARC_BUFC_METADATA));
5168 else
5169 return (abd_alloc(size, type == ARC_BUFC_METADATA));
5170 }
5171
5172 static void *
arc_get_data_buf(arc_buf_hdr_t * hdr,uint64_t size,const void * tag)5173 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
5174 {
5175 arc_buf_contents_t type = arc_buf_type(hdr);
5176
5177 arc_get_data_impl(hdr, size, tag, 0);
5178 if (type == ARC_BUFC_METADATA) {
5179 return (zio_buf_alloc(size));
5180 } else {
5181 ASSERT(type == ARC_BUFC_DATA);
5182 return (zio_data_buf_alloc(size));
5183 }
5184 }
5185
/*
 * Wait for the specified amount of data (in bytes) to be evicted from the
 * ARC, and for there to be sufficient free memory in the system.
 * The lax argument specifies that caller does not have a specific reason
 * to wait, not aware of any memory pressure.  Low memory handlers though
 * should set it to B_FALSE to wait for all required evictions to complete.
 * The use_reserve argument allows some callers to wait less than others
 * to not block critical code paths, possibly blocking other resources.
 */
void
arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve)
{
	switch (arc_is_overflowing(lax, use_reserve)) {
	case ARC_OVF_NONE:
		return;
	case ARC_OVF_SOME:
		/*
		 * This is a bit racy without taking arc_evict_lock, but the
		 * worst that can happen is we either call zthr_wakeup() extra
		 * time due to race with other thread here, or the set flag
		 * get cleared by arc_evict_cb(), which is unlikely due to
		 * big hysteresis, but also not important since at this level
		 * of overflow the eviction is purely advisory.  Same time
		 * taking the global lock here every time without waiting for
		 * the actual eviction creates a significant lock contention.
		 */
		if (!arc_evict_needed) {
			arc_evict_needed = B_TRUE;
			zthr_wakeup(arc_evict_zthr);
		}
		return;
	case ARC_OVF_SEVERE:
	default:
	{
		/*
		 * Stack-allocated waiter; the eviction thread removes it
		 * from arc_evict_waiters and signals aew_cv (see
		 * arc_evict_cb()), so it must not go out of scope until
		 * the link is inactive.
		 */
		arc_evict_waiter_t aw;
		list_link_init(&aw.aew_node);
		cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);

		uint64_t last_count = 0;
		mutex_enter(&arc_evict_lock);
		arc_evict_waiter_t *last;
		if ((last = list_tail(&arc_evict_waiters)) != NULL) {
			last_count = last->aew_count;
		} else if (!arc_evict_needed) {
			arc_evict_needed = B_TRUE;
			zthr_wakeup(arc_evict_zthr);
		}
		/*
		 * Note, the last waiter's count may be less than
		 * arc_evict_count if we are low on memory in which
		 * case arc_evict_state_impl() may have deferred
		 * wakeups (but still incremented arc_evict_count).
		 */
		aw.aew_count = MAX(last_count, arc_evict_count) + amount;

		list_insert_tail(&arc_evict_waiters, &aw);

		arc_set_need_free();

		DTRACE_PROBE3(arc__wait__for__eviction,
		    uint64_t, amount,
		    uint64_t, arc_evict_count,
		    uint64_t, aw.aew_count);

		/*
		 * We will be woken up either when arc_evict_count reaches
		 * aew_count, or when the ARC is no longer overflowing and
		 * eviction completes.
		 * In case of "false" wakeup, we will still be on the list.
		 */
		do {
			cv_wait(&aw.aew_cv, &arc_evict_lock);
		} while (list_link_active(&aw.aew_node));
		mutex_exit(&arc_evict_lock);

		cv_destroy(&aw.aew_cv);
	}
	}
}
5265
/*
 * Allocate a block and return it to the caller.  If we are hitting the
 * hard limit for the cache size, we must sleep, waiting for the eviction
 * thread to catch up.  If we're past the target size but below the hard
 * limit, we'll only signal the reclaim thread and continue on.
 *
 * alloc_flags may include ARC_HDR_USE_RESERVE, which relaxes the
 * overflow threshold used while waiting for eviction (see
 * arc_wait_for_eviction()).
 */
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
    int alloc_flags)
{
	arc_adapt(size);

	/*
	 * If arc_size is currently overflowing, we must be adding data
	 * faster than we are evicting.  To ensure we don't compound the
	 * problem by adding more data and forcing arc_size to grow even
	 * further past it's target size, we wait for the eviction thread to
	 * make some progress.  We also wait for there to be sufficient free
	 * memory in the system, as measured by arc_free_memory().
	 *
	 * Specifically, we wait for zfs_arc_eviction_pct percent of the
	 * requested size to be evicted.  This should be more than 100%, to
	 * ensure that that progress is also made towards getting arc_size
	 * under arc_c.  See the comment above zfs_arc_eviction_pct.
	 */
	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
	    B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE);

	arc_buf_contents_t type = arc_buf_type(hdr);
	if (type == ARC_BUFC_METADATA) {
		arc_space_consume(size, ARC_SPACE_META);
	} else {
		arc_space_consume(size, ARC_SPACE_DATA);
	}

	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	arc_state_t *state = hdr->b_l1hdr.b_state;
	if (!GHOST_STATE(state)) {

		(void) zfs_refcount_add_many(&state->arcs_size[type], size,
		    tag);

		/*
		 * If this is reached via arc_read, the link is
		 * protected by the hash lock.  If reached via
		 * arc_buf_alloc, the header should not be accessed by
		 * any other thread.  And, if reached via arc_read_done,
		 * the hash lock will protect it if it's found in the
		 * hash table; otherwise no other thread should be
		 * trying to [add|remove]_reference it.
		 */
		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
			ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			(void) zfs_refcount_add_many(&state->arcs_esize[type],
			    size, tag);
		}
	}
}
5327
/*
 * Release the space accounting for `size' bytes associated with `hdr'
 * and free the ABD holding the data.
 */
static void
arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size,
    const void *tag)
{
	arc_free_data_impl(hdr, size, tag);
	abd_free(abd);
}
5335
5336 static void
arc_free_data_buf(arc_buf_hdr_t * hdr,void * buf,uint64_t size,const void * tag)5337 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag)
5338 {
5339 arc_buf_contents_t type = arc_buf_type(hdr);
5340
5341 arc_free_data_impl(hdr, size, tag);
5342 if (type == ARC_BUFC_METADATA) {
5343 zio_buf_free(buf, size);
5344 } else {
5345 ASSERT(type == ARC_BUFC_DATA);
5346 zio_data_buf_free(buf, size);
5347 }
5348 }
5349
5350 /*
5351 * Free the arc data buffer.
5352 */
5353 static void
arc_free_data_impl(arc_buf_hdr_t * hdr,uint64_t size,const void * tag)5354 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
5355 {
5356 arc_state_t *state = hdr->b_l1hdr.b_state;
5357 arc_buf_contents_t type = arc_buf_type(hdr);
5358
5359 /* protected by hash lock, if in the hash table */
5360 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5361 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5362 ASSERT(state != arc_anon && state != arc_l2c_only);
5363
5364 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
5365 size, tag);
5366 }
5367 (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag);
5368
5369 VERIFY3U(hdr->b_type, ==, type);
5370 if (type == ARC_BUFC_METADATA) {
5371 arc_space_return(size, ARC_SPACE_META);
5372 } else {
5373 ASSERT(type == ARC_BUFC_DATA);
5374 arc_space_return(size, ARC_SPACE_DATA);
5375 }
5376 }
5377
5378 /*
5379 * This routine is called whenever a buffer is accessed.
5380 */
5381 static void
arc_access(arc_buf_hdr_t * hdr,arc_flags_t arc_flags,boolean_t hit)5382 arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
5383 {
5384 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
5385 ASSERT(HDR_HAS_L1HDR(hdr));
5386
5387 /*
5388 * Update buffer prefetch status.
5389 */
5390 boolean_t was_prefetch = HDR_PREFETCH(hdr);
5391 boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH;
5392 if (was_prefetch != now_prefetch) {
5393 if (was_prefetch) {
5394 ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit,
5395 HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive,
5396 prefetch);
5397 }
5398 if (HDR_HAS_L2HDR(hdr))
5399 l2arc_hdr_arcstats_decrement_state(hdr);
5400 if (was_prefetch) {
5401 arc_hdr_clear_flags(hdr,
5402 ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH);
5403 } else {
5404 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5405 }
5406 if (HDR_HAS_L2HDR(hdr))
5407 l2arc_hdr_arcstats_increment_state(hdr);
5408 }
5409 if (now_prefetch) {
5410 if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
5411 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5412 ARCSTAT_BUMP(arcstat_prescient_prefetch);
5413 } else {
5414 ARCSTAT_BUMP(arcstat_predictive_prefetch);
5415 }
5416 }
5417 if (arc_flags & ARC_FLAG_L2CACHE)
5418 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5419
5420 clock_t now = ddi_get_lbolt();
5421 if (hdr->b_l1hdr.b_state == arc_anon) {
5422 arc_state_t *new_state;
5423 /*
5424 * This buffer is not in the cache, and does not appear in
5425 * our "ghost" lists. Add it to the MRU or uncached state.
5426 */
5427 ASSERT0(hdr->b_l1hdr.b_arc_access);
5428 hdr->b_l1hdr.b_arc_access = now;
5429 if (HDR_UNCACHED(hdr)) {
5430 new_state = arc_uncached;
5431 DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
5432 hdr);
5433 } else {
5434 new_state = arc_mru;
5435 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5436 }
5437 arc_change_state(new_state, hdr);
5438 } else if (hdr->b_l1hdr.b_state == arc_mru) {
5439 /*
5440 * This buffer has been accessed once recently and either
5441 * its read is still in progress or it is in the cache.
5442 */
5443 if (HDR_IO_IN_PROGRESS(hdr)) {
5444 hdr->b_l1hdr.b_arc_access = now;
5445 return;
5446 }
5447 hdr->b_l1hdr.b_mru_hits++;
5448 ARCSTAT_BUMP(arcstat_mru_hits);
5449
5450 /*
5451 * If the previous access was a prefetch, then it already
5452 * handled possible promotion, so nothing more to do for now.
5453 */
5454 if (was_prefetch) {
5455 hdr->b_l1hdr.b_arc_access = now;
5456 return;
5457 }
5458
5459 /*
5460 * If more than ARC_MINTIME have passed from the previous
5461 * hit, promote the buffer to the MFU state.
5462 */
5463 if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
5464 ARC_MINTIME)) {
5465 hdr->b_l1hdr.b_arc_access = now;
5466 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5467 arc_change_state(arc_mfu, hdr);
5468 }
5469 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
5470 arc_state_t *new_state;
5471 /*
5472 * This buffer has been accessed once recently, but was
5473 * evicted from the cache. Would we have bigger MRU, it
5474 * would be an MRU hit, so handle it the same way, except
5475 * we don't need to check the previous access time.
5476 */
5477 hdr->b_l1hdr.b_mru_ghost_hits++;
5478 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
5479 hdr->b_l1hdr.b_arc_access = now;
5480 wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)],
5481 arc_hdr_size(hdr));
5482 if (was_prefetch) {
5483 new_state = arc_mru;
5484 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5485 } else {
5486 new_state = arc_mfu;
5487 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5488 }
5489 arc_change_state(new_state, hdr);
5490 } else if (hdr->b_l1hdr.b_state == arc_mfu) {
5491 /*
5492 * This buffer has been accessed more than once and either
5493 * still in the cache or being restored from one of ghosts.
5494 */
5495 if (!HDR_IO_IN_PROGRESS(hdr)) {
5496 hdr->b_l1hdr.b_mfu_hits++;
5497 ARCSTAT_BUMP(arcstat_mfu_hits);
5498 }
5499 hdr->b_l1hdr.b_arc_access = now;
5500 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
5501 /*
5502 * This buffer has been accessed more than once recently, but
5503 * has been evicted from the cache. Would we have bigger MFU
5504 * it would stay in cache, so move it back to MFU state.
5505 */
5506 hdr->b_l1hdr.b_mfu_ghost_hits++;
5507 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
5508 hdr->b_l1hdr.b_arc_access = now;
5509 wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)],
5510 arc_hdr_size(hdr));
5511 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5512 arc_change_state(arc_mfu, hdr);
5513 } else if (hdr->b_l1hdr.b_state == arc_uncached) {
5514 /*
5515 * This buffer is uncacheable, but we got a hit. Probably
5516 * a demand read after prefetch. Nothing more to do here.
5517 */
5518 if (!HDR_IO_IN_PROGRESS(hdr))
5519 ARCSTAT_BUMP(arcstat_uncached_hits);
5520 hdr->b_l1hdr.b_arc_access = now;
5521 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
5522 /*
5523 * This buffer is on the 2nd Level ARC and was not accessed
5524 * for a long time, so treat it as new and put into MRU.
5525 */
5526 hdr->b_l1hdr.b_arc_access = now;
5527 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5528 arc_change_state(arc_mru, hdr);
5529 } else {
5530 cmn_err(CE_PANIC, "invalid arc state 0x%p",
5531 hdr->b_l1hdr.b_state);
5532 }
5533 }
5534
5535 /*
5536 * This routine is called by dbuf_hold() to update the arc_access() state
5537 * which otherwise would be skipped for entries in the dbuf cache.
5538 */
5539 void
arc_buf_access(arc_buf_t * buf)5540 arc_buf_access(arc_buf_t *buf)
5541 {
5542 arc_buf_hdr_t *hdr = buf->b_hdr;
5543
5544 /*
5545 * Avoid taking the hash_lock when possible as an optimization.
5546 * The header must be checked again under the hash_lock in order
5547 * to handle the case where it is concurrently being released.
5548 */
5549 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr))
5550 return;
5551
5552 kmutex_t *hash_lock = HDR_LOCK(hdr);
5553 mutex_enter(hash_lock);
5554
5555 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5556 mutex_exit(hash_lock);
5557 ARCSTAT_BUMP(arcstat_access_skip);
5558 return;
5559 }
5560
5561 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5562 hdr->b_l1hdr.b_state == arc_mfu ||
5563 hdr->b_l1hdr.b_state == arc_uncached);
5564
5565 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5566 arc_access(hdr, 0, B_TRUE);
5567 mutex_exit(hash_lock);
5568
5569 ARCSTAT_BUMP(arcstat_hits);
5570 ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch,
5571 !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5572 }
5573
5574 /* a generic arc_read_done_func_t which you can use */
5575 void
arc_bcopy_func(zio_t * zio,const zbookmark_phys_t * zb,const blkptr_t * bp,arc_buf_t * buf,void * arg)5576 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5577 arc_buf_t *buf, void *arg)
5578 {
5579 (void) zio, (void) zb, (void) bp;
5580
5581 if (buf == NULL)
5582 return;
5583
5584 memcpy(arg, buf->b_data, arc_buf_size(buf));
5585 arc_buf_destroy(buf, arg);
5586 }
5587
5588 /* a generic arc_read_done_func_t */
5589 void
arc_getbuf_func(zio_t * zio,const zbookmark_phys_t * zb,const blkptr_t * bp,arc_buf_t * buf,void * arg)5590 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5591 arc_buf_t *buf, void *arg)
5592 {
5593 (void) zb, (void) bp;
5594 arc_buf_t **bufp = arg;
5595
5596 if (buf == NULL) {
5597 ASSERT(zio == NULL || zio->io_error != 0);
5598 *bufp = NULL;
5599 } else {
5600 ASSERT(zio == NULL || zio->io_error == 0);
5601 *bufp = buf;
5602 ASSERT(buf->b_data != NULL);
5603 }
5604 }
5605
5606 static void
arc_hdr_verify(arc_buf_hdr_t * hdr,blkptr_t * bp)5607 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
5608 {
5609 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5610 ASSERT0(HDR_GET_PSIZE(hdr));
5611 ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
5612 } else {
5613 if (HDR_COMPRESSION_ENABLED(hdr)) {
5614 ASSERT3U(arc_hdr_get_compress(hdr), ==,
5615 BP_GET_COMPRESS(bp));
5616 }
5617 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5618 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5619 ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
5620 }
5621 }
5622
/*
 * ZIO completion callback for an ARC read.
 *
 * Re-finds the header in the hash table (acquiring the hash lock),
 * records crypto parameters and the byteswap method for the block,
 * materializes an arc_buf_t for each registered callback, demotes the
 * header on error, and finally invokes or signals every queued callback.
 */
static void
arc_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	arc_buf_hdr_t *hdr = zio->io_private;
	kmutex_t *hash_lock = NULL;
	arc_callback_t *callback_list;
	arc_callback_t *acb;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	if (HDR_IN_HASH_TABLE(hdr)) {
		arc_buf_hdr_t *found;

		ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp));
		ASSERT3U(hdr->b_dva.dva_word[0], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
		ASSERT3U(hdr->b_dva.dva_word[1], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[1]);

		/* buf_hash_find() returns with hash_lock held on success. */
		found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);

		ASSERT((found == hdr &&
		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
		    (found == hdr && HDR_L2_READING(hdr)));
		ASSERT3P(hash_lock, !=, NULL);
	}

	if (BP_IS_PROTECTED(bp)) {
		/* Stash the crypto parameters needed to decrypt later. */
		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
		    hdr->b_crypt_hdr.b_iv);

		if (zio->io_error == 0) {
			if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
				/* ZIL blocks keep their MAC in the data. */
				void *tmpbuf;

				tmpbuf = abd_borrow_buf_copy(zio->io_abd,
				    sizeof (zil_chain_t));
				zio_crypt_decode_mac_zil(tmpbuf,
				    hdr->b_crypt_hdr.b_mac);
				abd_return_buf(zio->io_abd, tmpbuf,
				    sizeof (zil_chain_t));
			} else {
				zio_crypt_decode_mac_bp(bp,
				    hdr->b_crypt_hdr.b_mac);
			}
		}
	}

	if (zio->io_error == 0) {
		/* byteswap if necessary */
		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
			if (BP_GET_LEVEL(zio->io_bp) > 0) {
				/* Indirect blocks are arrays of blkptrs. */
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
			} else {
				hdr->b_l1hdr.b_byteswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
			}
		} else {
			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		}
		if (!HDR_L2_READING(hdr)) {
			hdr->b_complevel = zio->io_prop.zp_complevel;
		}
	}

	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);

	/* Detach the callback chain; new readers may now queue fresh acbs. */
	callback_list = hdr->b_l1hdr.b_acb;
	ASSERT3P(callback_list, !=, NULL);
	hdr->b_l1hdr.b_acb = NULL;

	/*
	 * If a read request has a callback (i.e. acb_done is not NULL), then we
	 * make a buf containing the data according to the parameters which were
	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
	 * aren't needlessly decompressing the data multiple times.
	 */
	int callback_cnt = 0;
	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {

		/* We need the last one to call below in original order. */
		callback_list = acb;

		if (!acb->acb_done || acb->acb_nobuf)
			continue;

		callback_cnt++;

		if (zio->io_error != 0)
			continue;

		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
		    &acb->acb_buf);

		/*
		 * Assert non-speculative zios didn't fail because an
		 * encryption key wasn't loaded
		 */
		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
		    error != EACCES);

		/*
		 * If we failed to decrypt, report an error now (as the zio
		 * layer would have done if it had done the transforms).
		 */
		if (error == ECKSUM) {
			ASSERT(BP_IS_PROTECTED(bp));
			error = SET_ERROR(EIO);
			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
				spa_log_error(zio->io_spa, &acb->acb_zb,
				    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
				(void) zfs_ereport_post(
				    FM_EREPORT_ZFS_AUTHENTICATION,
				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
			}
		}

		if (error != 0) {
			/*
			 * Decompression or decryption failed. Set
			 * io_error so that when we call acb_done
			 * (below), we will indicate that the read
			 * failed. Note that in the unusual case
			 * where one callback is compressed and another
			 * uncompressed, we will mark all of them
			 * as failed, even though the uncompressed
			 * one can't actually fail. In this case,
			 * the hdr will not be anonymous, because
			 * if there are multiple callbacks, it's
			 * because multiple threads found the same
			 * arc buf in the hash table.
			 */
			zio->io_error = error;
		}
	}

	/*
	 * If there are multiple callbacks, we must have the hash lock,
	 * because the only way for multiple threads to find this hdr is
	 * in the hash table. This ensures that if there are multiple
	 * callbacks, the hdr is not anonymous. If it were anonymous,
	 * we couldn't use arc_buf_destroy() in the error case below.
	 */
	ASSERT(callback_cnt < 2 || hash_lock != NULL);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);
	} else {
		/* On error, demote to anon and drop the hash identity. */
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
		if (hdr->b_l1hdr.b_state != arc_anon)
			arc_change_state(arc_anon, hdr);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
	}

	/* Release the extra reference taken for IO_IN_PROGRESS. */
	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
	(void) remove_reference(hdr, hdr);

	if (hash_lock != NULL)
		mutex_exit(hash_lock);

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done != NULL) {
			if (zio->io_error != 0 && acb->acb_buf != NULL) {
				/*
				 * If arc_buf_alloc_impl() fails during
				 * decompression, the buf will still be
				 * allocated, and needs to be freed here.
				 */
				arc_buf_destroy(acb->acb_buf,
				    acb->acb_private);
				acb->acb_buf = NULL;
			}
			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
			    acb->acb_buf, acb->acb_private);
		}

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		/* Walk acb_prev so callbacks run in original queue order. */
		callback_list = acb->acb_prev;
		if (acb->acb_wait) {
			/* Wake the synchronous reader in arc_read(). */
			mutex_enter(&acb->acb_wait_lock);
			acb->acb_wait_error = zio->io_error;
			acb->acb_wait = B_FALSE;
			cv_signal(&acb->acb_wait_cv);
			mutex_exit(&acb->acb_wait_lock);
			/* acb will be freed by the waiting thread. */
		} else {
			kmem_free(acb, sizeof (arc_callback_t));
		}
	}
}
5832
5833 /*
5834 * Lookup the block at the specified DVA (in bp), and return the manner in
5835 * which the block is cached. A zero return indicates not cached.
5836 */
5837 int
arc_cached(spa_t * spa,const blkptr_t * bp)5838 arc_cached(spa_t *spa, const blkptr_t *bp)
5839 {
5840 arc_buf_hdr_t *hdr = NULL;
5841 kmutex_t *hash_lock = NULL;
5842 uint64_t guid = spa_load_guid(spa);
5843 int flags = 0;
5844
5845 if (BP_IS_EMBEDDED(bp))
5846 return (ARC_CACHED_EMBEDDED);
5847
5848 hdr = buf_hash_find(guid, bp, &hash_lock);
5849 if (hdr == NULL)
5850 return (0);
5851
5852 if (HDR_HAS_L1HDR(hdr)) {
5853 arc_state_t *state = hdr->b_l1hdr.b_state;
5854 /*
5855 * We switch to ensure that any future arc_state_type_t
5856 * changes are handled. This is just a shift to promote
5857 * more compile-time checking.
5858 */
5859 switch (state->arcs_state) {
5860 case ARC_STATE_ANON:
5861 break;
5862 case ARC_STATE_MRU:
5863 flags |= ARC_CACHED_IN_MRU | ARC_CACHED_IN_L1;
5864 break;
5865 case ARC_STATE_MFU:
5866 flags |= ARC_CACHED_IN_MFU | ARC_CACHED_IN_L1;
5867 break;
5868 case ARC_STATE_UNCACHED:
5869 /* The header is still in L1, probably not for long */
5870 flags |= ARC_CACHED_IN_L1;
5871 break;
5872 default:
5873 break;
5874 }
5875 }
5876 if (HDR_HAS_L2HDR(hdr))
5877 flags |= ARC_CACHED_IN_L2;
5878
5879 mutex_exit(hash_lock);
5880
5881 return (flags);
5882 }
5883
5884 /*
5885 * "Read" the block at the specified DVA (in bp) via the
5886 * cache. If the block is found in the cache, invoke the provided
5887 * callback immediately and return. Note that the `zio' parameter
5888 * in the callback will be NULL in this case, since no IO was
5889 * required. If the block is not in the cache pass the read request
5890 * on to the spa with a substitute callback function, so that the
5891 * requested block will be added to the cache.
5892 *
5893 * If a read request arrives for a block that has a read in-progress,
5894 * either wait for the in-progress read to complete (and return the
5895 * results); or, if this is a read with a "done" func, add a record
5896 * to the read to invoke the "done" func when the read completes,
5897 * and return; or just return.
5898 *
5899 * arc_read_done() will invoke all the requested "done" functions
5900 * for readers of this block.
5901 */
5902 int
arc_read(zio_t * pio,spa_t * spa,const blkptr_t * bp,arc_read_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,arc_flags_t * arc_flags,const zbookmark_phys_t * zb)5903 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
5904 arc_read_done_func_t *done, void *private, zio_priority_t priority,
5905 int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5906 {
5907 arc_buf_hdr_t *hdr = NULL;
5908 kmutex_t *hash_lock = NULL;
5909 zio_t *rzio;
5910 uint64_t guid = spa_load_guid(spa);
5911 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
5912 boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
5913 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5914 boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
5915 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5916 boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
5917 boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
5918 arc_buf_t *buf = NULL;
5919 int rc = 0;
5920 boolean_t bp_validation = B_FALSE;
5921
5922 ASSERT(!embedded_bp ||
5923 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5924 ASSERT(!BP_IS_HOLE(bp));
5925 ASSERT(!BP_IS_REDACTED(bp));
5926
5927 /*
5928 * Normally SPL_FSTRANS will already be set since kernel threads which
5929 * expect to call the DMU interfaces will set it when created. System
5930 * calls are similarly handled by setting/cleaning the bit in the
5931 * registered callback (module/os/.../zfs/zpl_*).
5932 *
5933 * External consumers such as Lustre which call the exported DMU
5934 * interfaces may not have set SPL_FSTRANS. To avoid a deadlock
5935 * on the hash_lock always set and clear the bit.
5936 */
5937 fstrans_cookie_t cookie = spl_fstrans_mark();
5938 top:
5939 if (!embedded_bp) {
5940 /*
5941 * Embedded BP's have no DVA and require no I/O to "read".
5942 * Create an anonymous arc buf to back it.
5943 */
5944 hdr = buf_hash_find(guid, bp, &hash_lock);
5945 }
5946
5947 /*
5948 * Determine if we have an L1 cache hit or a cache miss. For simplicity
5949 * we maintain encrypted data separately from compressed / uncompressed
5950 * data. If the user is requesting raw encrypted data and we don't have
5951 * that in the header we will read from disk to guarantee that we can
5952 * get it even if the encryption keys aren't loaded.
5953 */
5954 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
5955 (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
5956 boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
5957
5958 /*
5959 * Verify the block pointer contents are reasonable. This
5960 * should always be the case since the blkptr is protected by
5961 * a checksum.
5962 */
5963 if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP,
5964 BLK_VERIFY_LOG)) {
5965 mutex_exit(hash_lock);
5966 rc = SET_ERROR(ECKSUM);
5967 goto done;
5968 }
5969
5970 if (HDR_IO_IN_PROGRESS(hdr)) {
5971 if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
5972 mutex_exit(hash_lock);
5973 ARCSTAT_BUMP(arcstat_cached_only_in_progress);
5974 rc = SET_ERROR(ENOENT);
5975 goto done;
5976 }
5977
5978 zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5979 ASSERT3P(head_zio, !=, NULL);
5980 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5981 priority == ZIO_PRIORITY_SYNC_READ) {
5982 /*
5983 * This is a sync read that needs to wait for
5984 * an in-flight async read. Request that the
5985 * zio have its priority upgraded.
5986 */
5987 zio_change_priority(head_zio, priority);
5988 DTRACE_PROBE1(arc__async__upgrade__sync,
5989 arc_buf_hdr_t *, hdr);
5990 ARCSTAT_BUMP(arcstat_async_upgrade_sync);
5991 }
5992
5993 DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr);
5994 arc_access(hdr, *arc_flags, B_FALSE);
5995
5996 /*
5997 * If there are multiple threads reading the same block
5998 * and that block is not yet in the ARC, then only one
5999 * thread will do the physical I/O and all other
6000 * threads will wait until that I/O completes.
6001 * Synchronous reads use the acb_wait_cv whereas nowait
6002 * reads register a callback. Both are signalled/called
6003 * in arc_read_done.
6004 *
6005 * Errors of the physical I/O may need to be propagated.
6006 * Synchronous read errors are returned here from
6007 * arc_read_done via acb_wait_error. Nowait reads
6008 * attach the acb_zio_dummy zio to pio and
6009 * arc_read_done propagates the physical I/O's io_error
6010 * to acb_zio_dummy, and thereby to pio.
6011 */
6012 arc_callback_t *acb = NULL;
6013 if (done || pio || *arc_flags & ARC_FLAG_WAIT) {
6014 acb = kmem_zalloc(sizeof (arc_callback_t),
6015 KM_SLEEP);
6016 acb->acb_done = done;
6017 acb->acb_private = private;
6018 acb->acb_compressed = compressed_read;
6019 acb->acb_encrypted = encrypted_read;
6020 acb->acb_noauth = noauth_read;
6021 acb->acb_nobuf = no_buf;
6022 if (*arc_flags & ARC_FLAG_WAIT) {
6023 acb->acb_wait = B_TRUE;
6024 mutex_init(&acb->acb_wait_lock, NULL,
6025 MUTEX_DEFAULT, NULL);
6026 cv_init(&acb->acb_wait_cv, NULL,
6027 CV_DEFAULT, NULL);
6028 }
6029 acb->acb_zb = *zb;
6030 if (pio != NULL) {
6031 acb->acb_zio_dummy = zio_null(pio,
6032 spa, NULL, NULL, NULL, zio_flags);
6033 }
6034 acb->acb_zio_head = head_zio;
6035 acb->acb_next = hdr->b_l1hdr.b_acb;
6036 hdr->b_l1hdr.b_acb->acb_prev = acb;
6037 hdr->b_l1hdr.b_acb = acb;
6038 }
6039 mutex_exit(hash_lock);
6040
6041 ARCSTAT_BUMP(arcstat_iohits);
6042 ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
6043 demand, prefetch, is_data, data, metadata, iohits);
6044
6045 if (*arc_flags & ARC_FLAG_WAIT) {
6046 mutex_enter(&acb->acb_wait_lock);
6047 while (acb->acb_wait) {
6048 cv_wait(&acb->acb_wait_cv,
6049 &acb->acb_wait_lock);
6050 }
6051 rc = acb->acb_wait_error;
6052 mutex_exit(&acb->acb_wait_lock);
6053 mutex_destroy(&acb->acb_wait_lock);
6054 cv_destroy(&acb->acb_wait_cv);
6055 kmem_free(acb, sizeof (arc_callback_t));
6056 }
6057 goto out;
6058 }
6059
6060 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
6061 hdr->b_l1hdr.b_state == arc_mfu ||
6062 hdr->b_l1hdr.b_state == arc_uncached);
6063
6064 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
6065 arc_access(hdr, *arc_flags, B_TRUE);
6066
6067 if (done && !no_buf) {
6068 ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
6069
6070 /* Get a buf with the desired data in it. */
6071 rc = arc_buf_alloc_impl(hdr, spa, zb, private,
6072 encrypted_read, compressed_read, noauth_read,
6073 B_TRUE, &buf);
6074 if (rc == ECKSUM) {
6075 /*
6076 * Convert authentication and decryption errors
6077 * to EIO (and generate an ereport if needed)
6078 * before leaving the ARC.
6079 */
6080 rc = SET_ERROR(EIO);
6081 if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
6082 spa_log_error(spa, zb, hdr->b_birth);
6083 (void) zfs_ereport_post(
6084 FM_EREPORT_ZFS_AUTHENTICATION,
6085 spa, NULL, zb, NULL, 0);
6086 }
6087 }
6088 if (rc != 0) {
6089 arc_buf_destroy_impl(buf);
6090 buf = NULL;
6091 (void) remove_reference(hdr, private);
6092 }
6093
6094 /* assert any errors weren't due to unloaded keys */
6095 ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
6096 rc != EACCES);
6097 }
6098 mutex_exit(hash_lock);
6099 ARCSTAT_BUMP(arcstat_hits);
6100 ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
6101 demand, prefetch, is_data, data, metadata, hits);
6102 *arc_flags |= ARC_FLAG_CACHED;
6103 goto done;
6104 } else {
6105 uint64_t lsize = BP_GET_LSIZE(bp);
6106 uint64_t psize = BP_GET_PSIZE(bp);
6107 arc_callback_t *acb;
6108 vdev_t *vd = NULL;
6109 uint64_t addr = 0;
6110 boolean_t devw = B_FALSE;
6111 uint64_t size;
6112 abd_t *hdr_abd;
6113 int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
6114 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
6115 int config_lock;
6116 int error;
6117
6118 if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
6119 if (hash_lock != NULL)
6120 mutex_exit(hash_lock);
6121 rc = SET_ERROR(ENOENT);
6122 goto done;
6123 }
6124
6125 if (zio_flags & ZIO_FLAG_CONFIG_WRITER) {
6126 config_lock = BLK_CONFIG_HELD;
6127 } else if (hash_lock != NULL) {
6128 /*
6129 * Prevent lock order reversal
6130 */
6131 config_lock = BLK_CONFIG_NEEDED_TRY;
6132 } else {
6133 config_lock = BLK_CONFIG_NEEDED;
6134 }
6135
6136 /*
6137 * Verify the block pointer contents are reasonable. This
6138 * should always be the case since the blkptr is protected by
6139 * a checksum.
6140 */
6141 if (!bp_validation && (error = zfs_blkptr_verify(spa, bp,
6142 config_lock, BLK_VERIFY_LOG))) {
6143 if (hash_lock != NULL)
6144 mutex_exit(hash_lock);
6145 if (error == EBUSY && !zfs_blkptr_verify(spa, bp,
6146 BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
6147 bp_validation = B_TRUE;
6148 goto top;
6149 }
6150 rc = SET_ERROR(ECKSUM);
6151 goto done;
6152 }
6153
6154 if (hdr == NULL) {
6155 /*
6156 * This block is not in the cache or it has
6157 * embedded data.
6158 */
6159 arc_buf_hdr_t *exists = NULL;
6160 hdr = arc_hdr_alloc(guid, psize, lsize,
6161 BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
6162
6163 if (!embedded_bp) {
6164 hdr->b_dva = *BP_IDENTITY(bp);
6165 hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp);
6166 exists = buf_hash_insert(hdr, &hash_lock);
6167 }
6168 if (exists != NULL) {
6169 /* somebody beat us to the hash insert */
6170 mutex_exit(hash_lock);
6171 buf_discard_identity(hdr);
6172 arc_hdr_destroy(hdr);
6173 goto top; /* restart the IO request */
6174 }
6175 } else {
6176 /*
6177 * This block is in the ghost cache or encrypted data
6178 * was requested and we didn't have it. If it was
6179 * L2-only (and thus didn't have an L1 hdr),
6180 * we realloc the header to add an L1 hdr.
6181 */
6182 if (!HDR_HAS_L1HDR(hdr)) {
6183 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
6184 hdr_full_cache);
6185 }
6186
6187 if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
6188 ASSERT0P(hdr->b_l1hdr.b_pabd);
6189 ASSERT(!HDR_HAS_RABD(hdr));
6190 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6191 ASSERT0(zfs_refcount_count(
6192 &hdr->b_l1hdr.b_refcnt));
6193 ASSERT0P(hdr->b_l1hdr.b_buf);
6194 #ifdef ZFS_DEBUG
6195 ASSERT0P(hdr->b_l1hdr.b_freeze_cksum);
6196 #endif
6197 } else if (HDR_IO_IN_PROGRESS(hdr)) {
6198 /*
6199 * If this header already had an IO in progress
6200 * and we are performing another IO to fetch
6201 * encrypted data we must wait until the first
6202 * IO completes so as not to confuse
6203 * arc_read_done(). This should be very rare
6204 * and so the performance impact shouldn't
6205 * matter.
6206 */
6207 arc_callback_t *acb = kmem_zalloc(
6208 sizeof (arc_callback_t), KM_SLEEP);
6209 acb->acb_wait = B_TRUE;
6210 mutex_init(&acb->acb_wait_lock, NULL,
6211 MUTEX_DEFAULT, NULL);
6212 cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
6213 NULL);
6214 acb->acb_zio_head =
6215 hdr->b_l1hdr.b_acb->acb_zio_head;
6216 acb->acb_next = hdr->b_l1hdr.b_acb;
6217 hdr->b_l1hdr.b_acb->acb_prev = acb;
6218 hdr->b_l1hdr.b_acb = acb;
6219 mutex_exit(hash_lock);
6220 mutex_enter(&acb->acb_wait_lock);
6221 while (acb->acb_wait) {
6222 cv_wait(&acb->acb_wait_cv,
6223 &acb->acb_wait_lock);
6224 }
6225 mutex_exit(&acb->acb_wait_lock);
6226 mutex_destroy(&acb->acb_wait_lock);
6227 cv_destroy(&acb->acb_wait_cv);
6228 kmem_free(acb, sizeof (arc_callback_t));
6229 goto top;
6230 }
6231 }
6232 if (*arc_flags & ARC_FLAG_UNCACHED) {
6233 arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
6234 if (!encrypted_read)
6235 alloc_flags |= ARC_HDR_ALLOC_LINEAR;
6236 }
6237
6238 /*
6239 * Take additional reference for IO_IN_PROGRESS. It stops
6240 * arc_access() from putting this header without any buffers
6241 * and so other references but obviously nonevictable onto
6242 * the evictable list of MRU or MFU state.
6243 */
6244 add_reference(hdr, hdr);
6245 if (!embedded_bp)
6246 arc_access(hdr, *arc_flags, B_FALSE);
6247 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6248 arc_hdr_alloc_abd(hdr, alloc_flags);
6249 if (encrypted_read) {
6250 ASSERT(HDR_HAS_RABD(hdr));
6251 size = HDR_GET_PSIZE(hdr);
6252 hdr_abd = hdr->b_crypt_hdr.b_rabd;
6253 zio_flags |= ZIO_FLAG_RAW;
6254 } else {
6255 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
6256 size = arc_hdr_size(hdr);
6257 hdr_abd = hdr->b_l1hdr.b_pabd;
6258
6259 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
6260 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6261 }
6262
6263 /*
6264 * For authenticated bp's, we do not ask the ZIO layer
6265 * to authenticate them since this will cause the entire
6266 * IO to fail if the key isn't loaded. Instead, we
6267 * defer authentication until arc_buf_fill(), which will
6268 * verify the data when the key is available.
6269 */
6270 if (BP_IS_AUTHENTICATED(bp))
6271 zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
6272 }
6273
6274 if (BP_IS_AUTHENTICATED(bp))
6275 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6276 if (BP_GET_LEVEL(bp) > 0)
6277 arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
6278 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
6279
6280 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
6281 acb->acb_done = done;
6282 acb->acb_private = private;
6283 acb->acb_compressed = compressed_read;
6284 acb->acb_encrypted = encrypted_read;
6285 acb->acb_noauth = noauth_read;
6286 acb->acb_nobuf = no_buf;
6287 acb->acb_zb = *zb;
6288
6289 ASSERT0P(hdr->b_l1hdr.b_acb);
6290 hdr->b_l1hdr.b_acb = acb;
6291
6292 if (HDR_HAS_L2HDR(hdr) &&
6293 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
6294 devw = hdr->b_l2hdr.b_dev->l2ad_writing;
6295 addr = hdr->b_l2hdr.b_daddr;
6296 /*
6297 * Lock out L2ARC device removal.
6298 */
6299 if (vdev_is_dead(vd) ||
6300 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
6301 vd = NULL;
6302 }
6303
6304 /*
6305 * We count both async reads and scrub IOs as asynchronous so
6306 * that both can be upgraded in the event of a cache hit while
6307 * the read IO is still in-flight.
6308 */
6309 if (priority == ZIO_PRIORITY_ASYNC_READ ||
6310 priority == ZIO_PRIORITY_SCRUB)
6311 arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6312 else
6313 arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6314
6315 /*
6316 * At this point, we have a level 1 cache miss or a blkptr
6317 * with embedded data. Try again in L2ARC if possible.
6318 */
6319 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
6320
6321 /*
6322 * Skip ARC stat bump for block pointers with embedded
6323 * data. The data are read from the blkptr itself via
6324 * decode_embedded_bp_compressed().
6325 */
6326 if (!embedded_bp) {
6327 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
6328 blkptr_t *, bp, uint64_t, lsize,
6329 zbookmark_phys_t *, zb);
6330 ARCSTAT_BUMP(arcstat_misses);
6331 ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
6332 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
6333 metadata, misses);
6334 zfs_racct_read(spa, size, 1,
6335 (*arc_flags & ARC_FLAG_UNCACHED) ?
6336 DMU_UNCACHEDIO : 0);
6337 }
6338
6339 /* Check if the spa even has l2 configured */
6340 const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
6341 spa->spa_l2cache.sav_count > 0;
6342
6343 if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
6344 /*
6345 * Read from the L2ARC if the following are true:
6346 * 1. The L2ARC vdev was previously cached.
6347 * 2. This buffer still has L2ARC metadata.
6348 * 3. This buffer isn't currently writing to the L2ARC.
6349 * 4. The L2ARC entry wasn't evicted, which may
6350 * also have invalidated the vdev.
6351 */
6352 if (HDR_HAS_L2HDR(hdr) &&
6353 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
6354 l2arc_read_callback_t *cb;
6355 abd_t *abd;
6356 uint64_t asize;
6357
6358 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
6359 ARCSTAT_BUMP(arcstat_l2_hits);
6360 hdr->b_l2hdr.b_hits++;
6361
6362 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
6363 KM_SLEEP);
6364 cb->l2rcb_hdr = hdr;
6365 cb->l2rcb_bp = *bp;
6366 cb->l2rcb_zb = *zb;
6367 cb->l2rcb_flags = zio_flags;
6368
6369 /*
6370 * When Compressed ARC is disabled, but the
6371 * L2ARC block is compressed, arc_hdr_size()
6372 * will have returned LSIZE rather than PSIZE.
6373 */
6374 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
6375 !HDR_COMPRESSION_ENABLED(hdr) &&
6376 HDR_GET_PSIZE(hdr) != 0) {
6377 size = HDR_GET_PSIZE(hdr);
6378 }
6379
6380 asize = vdev_psize_to_asize(vd, size);
6381 if (asize != size) {
6382 abd = abd_alloc_for_io(asize,
6383 HDR_ISTYPE_METADATA(hdr));
6384 cb->l2rcb_abd = abd;
6385 } else {
6386 abd = hdr_abd;
6387 }
6388
6389 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
6390 addr + asize <= vd->vdev_psize -
6391 VDEV_LABEL_END_SIZE);
6392
6393 /*
6394 * l2arc read. The SCL_L2ARC lock will be
6395 * released by l2arc_read_done().
6396 * Issue a null zio if the underlying buffer
6397 * was squashed to zero size by compression.
6398 */
6399 ASSERT3U(arc_hdr_get_compress(hdr), !=,
6400 ZIO_COMPRESS_EMPTY);
6401 rzio = zio_read_phys(pio, vd, addr,
6402 asize, abd,
6403 ZIO_CHECKSUM_OFF,
6404 l2arc_read_done, cb, priority,
6405 zio_flags | ZIO_FLAG_CANFAIL |
6406 ZIO_FLAG_DONT_PROPAGATE |
6407 ZIO_FLAG_DONT_RETRY, B_FALSE);
6408 acb->acb_zio_head = rzio;
6409
6410 if (hash_lock != NULL)
6411 mutex_exit(hash_lock);
6412
6413 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6414 zio_t *, rzio);
6415 ARCSTAT_INCR(arcstat_l2_read_bytes,
6416 HDR_GET_PSIZE(hdr));
6417
6418 if (*arc_flags & ARC_FLAG_NOWAIT) {
6419 zio_nowait(rzio);
6420 goto out;
6421 }
6422
6423 ASSERT(*arc_flags & ARC_FLAG_WAIT);
6424 if (zio_wait(rzio) == 0)
6425 goto out;
6426
6427 /* l2arc read error; goto zio_read() */
6428 if (hash_lock != NULL)
6429 mutex_enter(hash_lock);
6430 } else {
6431 DTRACE_PROBE1(l2arc__miss,
6432 arc_buf_hdr_t *, hdr);
6433 ARCSTAT_BUMP(arcstat_l2_misses);
6434 if (HDR_L2_WRITING(hdr))
6435 ARCSTAT_BUMP(arcstat_l2_rw_clash);
6436 spa_config_exit(spa, SCL_L2ARC, vd);
6437 }
6438 } else {
6439 if (vd != NULL)
6440 spa_config_exit(spa, SCL_L2ARC, vd);
6441
6442 /*
6443 * Only a spa with l2 should contribute to l2
6444 * miss stats. (Including the case of having a
6445 * faulted cache device - that's also a miss.)
6446 */
6447 if (spa_has_l2) {
6448 /*
6449 * Skip ARC stat bump for block pointers with
6450 * embedded data. The data are read from the
6451 * blkptr itself via
6452 * decode_embedded_bp_compressed().
6453 */
6454 if (!embedded_bp) {
6455 DTRACE_PROBE1(l2arc__miss,
6456 arc_buf_hdr_t *, hdr);
6457 ARCSTAT_BUMP(arcstat_l2_misses);
6458 }
6459 }
6460 }
6461
6462 rzio = zio_read(pio, spa, bp, hdr_abd, size,
6463 arc_read_done, hdr, priority, zio_flags, zb);
6464 acb->acb_zio_head = rzio;
6465
6466 if (hash_lock != NULL)
6467 mutex_exit(hash_lock);
6468
6469 if (*arc_flags & ARC_FLAG_WAIT) {
6470 rc = zio_wait(rzio);
6471 goto out;
6472 }
6473
6474 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6475 zio_nowait(rzio);
6476 }
6477
6478 out:
6479 /* embedded bps don't actually go to disk */
6480 if (!embedded_bp)
6481 spa_read_history_add(spa, zb, *arc_flags);
6482 spl_fstrans_unmark(cookie);
6483 return (rc);
6484
6485 done:
6486 if (done)
6487 done(NULL, zb, bp, buf, private);
6488 if (pio && rc != 0) {
6489 zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags);
6490 zio->io_error = rc;
6491 zio_nowait(zio);
6492 }
6493 goto out;
6494 }
6495
6496 arc_prune_t *
arc_add_prune_callback(arc_prune_func_t * func,void * private)6497 arc_add_prune_callback(arc_prune_func_t *func, void *private)
6498 {
6499 arc_prune_t *p;
6500
6501 p = kmem_alloc(sizeof (*p), KM_SLEEP);
6502 p->p_pfunc = func;
6503 p->p_private = private;
6504 list_link_init(&p->p_node);
6505 zfs_refcount_create(&p->p_refcnt);
6506
6507 mutex_enter(&arc_prune_mtx);
6508 zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
6509 list_insert_head(&arc_prune_list, p);
6510 mutex_exit(&arc_prune_mtx);
6511
6512 return (p);
6513 }
6514
6515 void
arc_remove_prune_callback(arc_prune_t * p)6516 arc_remove_prune_callback(arc_prune_t *p)
6517 {
6518 boolean_t wait = B_FALSE;
6519 mutex_enter(&arc_prune_mtx);
6520 list_remove(&arc_prune_list, p);
6521 if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
6522 wait = B_TRUE;
6523 mutex_exit(&arc_prune_mtx);
6524
6525 /* wait for arc_prune_task to finish */
6526 if (wait)
6527 taskq_wait_outstanding(arc_prune_taskq, 0);
6528 ASSERT0(zfs_refcount_count(&p->p_refcnt));
6529 zfs_refcount_destroy(&p->p_refcnt);
6530 kmem_free(p, sizeof (*p));
6531 }
6532
6533 /*
6534 * Helper function for arc_prune_async() it is responsible for safely
6535 * handling the execution of a registered arc_prune_func_t.
6536 */
6537 static void
arc_prune_task(void * ptr)6538 arc_prune_task(void *ptr)
6539 {
6540 arc_prune_t *ap = (arc_prune_t *)ptr;
6541 arc_prune_func_t *func = ap->p_pfunc;
6542
6543 if (func != NULL)
6544 func(ap->p_adjust, ap->p_private);
6545
6546 (void) zfs_refcount_remove(&ap->p_refcnt, func);
6547 }
6548
6549 /*
6550 * Notify registered consumers they must drop holds on a portion of the ARC
6551 * buffers they reference. This provides a mechanism to ensure the ARC can
6552 * honor the metadata limit and reclaim otherwise pinned ARC buffers.
6553 *
6554 * This operation is performed asynchronously so it may be safely called
6555 * in the context of the arc_reclaim_thread(). A reference is taken here
6556 * for each registered arc_prune_t and the arc_prune_task() is responsible
6557 * for releasing it once the registered arc_prune_func_t has completed.
6558 */
6559 static void
arc_prune_async(uint64_t adjust)6560 arc_prune_async(uint64_t adjust)
6561 {
6562 arc_prune_t *ap;
6563
6564 mutex_enter(&arc_prune_mtx);
6565 for (ap = list_head(&arc_prune_list); ap != NULL;
6566 ap = list_next(&arc_prune_list, ap)) {
6567
6568 if (zfs_refcount_count(&ap->p_refcnt) >= 2)
6569 continue;
6570
6571 zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
6572 ap->p_adjust = adjust;
6573 if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
6574 ap, TQ_SLEEP) == TASKQID_INVALID) {
6575 (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
6576 continue;
6577 }
6578 ARCSTAT_BUMP(arcstat_prune);
6579 }
6580 mutex_exit(&arc_prune_mtx);
6581 }
6582
6583 /*
6584 * Notify the arc that a block was freed, and thus will never be used again.
6585 */
6586 void
arc_freed(spa_t * spa,const blkptr_t * bp)6587 arc_freed(spa_t *spa, const blkptr_t *bp)
6588 {
6589 arc_buf_hdr_t *hdr;
6590 kmutex_t *hash_lock;
6591 uint64_t guid = spa_load_guid(spa);
6592
6593 ASSERT(!BP_IS_EMBEDDED(bp));
6594
6595 hdr = buf_hash_find(guid, bp, &hash_lock);
6596 if (hdr == NULL)
6597 return;
6598
6599 /*
6600 * We might be trying to free a block that is still doing I/O
6601 * (i.e. prefetch) or has some other reference (i.e. a dedup-ed,
6602 * dmu_sync-ed block). A block may also have a reference if it is
6603 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
6604 * have written the new block to its final resting place on disk but
6605 * without the dedup flag set. This would have left the hdr in the MRU
6606 * state and discoverable. When the txg finally syncs it detects that
6607 * the block was overridden in open context and issues an override I/O.
6608 * Since this is a dedup block, the override I/O will determine if the
6609 * block is already in the DDT. If so, then it will replace the io_bp
6610 * with the bp from the DDT and allow the I/O to finish. When the I/O
6611 * reaches the done callback, dbuf_write_override_done, it will
6612 * check to see if the io_bp and io_bp_override are identical.
6613 * If they are not, then it indicates that the bp was replaced with
6614 * the bp in the DDT and the override bp is freed. This allows
6615 * us to arrive here with a reference on a block that is being
6616 * freed. So if we have an I/O in progress, or a reference to
6617 * this hdr, then we don't destroy the hdr.
6618 */
6619 if (!HDR_HAS_L1HDR(hdr) ||
6620 zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6621 arc_change_state(arc_anon, hdr);
6622 arc_hdr_destroy(hdr);
6623 mutex_exit(hash_lock);
6624 } else {
6625 mutex_exit(hash_lock);
6626 }
6627
6628 }
6629
6630 /*
6631 * Release this buffer from the cache, making it an anonymous buffer. This
6632 * must be done after a read and prior to modifying the buffer contents.
6633 * If the buffer has more than one reference, we must make
6634 * a new hdr for the buffer.
6635 */
6636 void
arc_release(arc_buf_t * buf,const void * tag)6637 arc_release(arc_buf_t *buf, const void *tag)
6638 {
6639 arc_buf_hdr_t *hdr = buf->b_hdr;
6640
6641 /*
6642 * It would be nice to assert that if its DMU metadata (level >
6643 * 0 || it's the dnode file), then it must be syncing context.
6644 * But we don't know that information at this level.
6645 */
6646
6647 ASSERT(HDR_HAS_L1HDR(hdr));
6648
6649 /*
6650 * We don't grab the hash lock prior to this check, because if
6651 * the buffer's header is in the arc_anon state, it won't be
6652 * linked into the hash table.
6653 */
6654 if (hdr->b_l1hdr.b_state == arc_anon) {
6655 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6656 ASSERT(!HDR_IN_HASH_TABLE(hdr));
6657 ASSERT(!HDR_HAS_L2HDR(hdr));
6658
6659 ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
6660 ASSERT(ARC_BUF_LAST(buf));
6661 ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
6662 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
6663
6664 hdr->b_l1hdr.b_arc_access = 0;
6665
6666 /*
6667 * If the buf is being overridden then it may already
6668 * have a hdr that is not empty.
6669 */
6670 buf_discard_identity(hdr);
6671 arc_buf_thaw(buf);
6672
6673 return;
6674 }
6675
6676 kmutex_t *hash_lock = HDR_LOCK(hdr);
6677 mutex_enter(hash_lock);
6678
6679 /*
6680 * This assignment is only valid as long as the hash_lock is
6681 * held, we must be careful not to reference state or the
6682 * b_state field after dropping the lock.
6683 */
6684 arc_state_t *state = hdr->b_l1hdr.b_state;
6685 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6686 ASSERT3P(state, !=, arc_anon);
6687 ASSERT3P(state, !=, arc_l2c_only);
6688
6689 /* this buffer is not on any list */
6690 ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
6691
6692 /*
6693 * Do we have more than one buf? Or L2_WRITING with unshared data?
6694 * Single-buf L2_WRITING with shared data can reuse the header since
6695 * L2ARC uses its own transformed copy.
6696 */
6697 if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf) ||
6698 (HDR_L2_WRITING(hdr) && !ARC_BUF_SHARED(buf))) {
6699 arc_buf_hdr_t *nhdr;
6700 uint64_t spa = hdr->b_spa;
6701 uint64_t psize = HDR_GET_PSIZE(hdr);
6702 uint64_t lsize = HDR_GET_LSIZE(hdr);
6703 boolean_t protected = HDR_PROTECTED(hdr);
6704 enum zio_compress compress = arc_hdr_get_compress(hdr);
6705 uint8_t complevel = hdr->b_complevel;
6706 arc_buf_contents_t type = arc_buf_type(hdr);
6707 boolean_t single_buf_l2writing = (hdr->b_l1hdr.b_buf == buf &&
6708 ARC_BUF_LAST(buf) && HDR_L2_WRITING(hdr));
6709
6710 if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
6711 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6712 ASSERT(ARC_BUF_LAST(buf));
6713 }
6714
6715 /*
6716 * Pull the buffer off of this hdr and find the last buffer
6717 * in the hdr's buffer list.
6718 */
6719 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
6720 EQUIV(single_buf_l2writing, lastbuf == NULL);
6721
6722 /*
6723 * If the current arc_buf_t and the hdr are sharing their data
6724 * buffer, then we must stop sharing that block.
6725 */
6726 if (!single_buf_l2writing) {
6727 if (ARC_BUF_SHARED(buf)) {
6728 ASSERT(!arc_buf_is_shared(lastbuf));
6729
6730 /*
6731 * First, sever the block sharing relationship
6732 * between buf and the arc_buf_hdr_t.
6733 */
6734 arc_unshare_buf(hdr, buf);
6735
6736 /*
6737 * Now we need to recreate the hdr's b_pabd.
6738 * Since we have lastbuf handy, we try to share
6739 * with it, but if we can't then we allocate a
6740 * new b_pabd and copy the data from buf into it
6741 */
6742 if (arc_can_share(hdr, lastbuf)) {
6743 arc_share_buf(hdr, lastbuf);
6744 } else {
6745 arc_hdr_alloc_abd(hdr, 0);
6746 abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
6747 buf->b_data, psize);
6748 }
6749 } else if (HDR_SHARED_DATA(hdr)) {
6750 /*
6751 * Uncompressed shared buffers are always at the
6752 * end of the list. Compressed buffers don't
6753 * have the same requirements. This makes it
6754 * hard to simply assert that the lastbuf is
6755 * shared so we rely on the hdr's compression
6756 * flags to determine if we have a compressed,
6757 * shared buffer.
6758 */
6759 ASSERT(arc_buf_is_shared(lastbuf) ||
6760 arc_hdr_get_compress(hdr) !=
6761 ZIO_COMPRESS_OFF);
6762 ASSERT(!arc_buf_is_shared(buf));
6763 }
6764 }
6765
6766 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
6767
6768 (void) zfs_refcount_remove_many(&state->arcs_size[type],
6769 arc_buf_size(buf), buf);
6770
6771 arc_cksum_verify(buf);
6772 arc_buf_unwatch(buf);
6773
6774 /* if this is the last uncompressed buf free the checksum */
6775 if (!arc_hdr_has_uncompressed_buf(hdr))
6776 arc_cksum_free(hdr);
6777
6778 if (single_buf_l2writing)
6779 VERIFY3S(remove_reference(hdr, tag), ==, 0);
6780 else
6781 VERIFY3S(remove_reference(hdr, tag), >, 0);
6782
6783 mutex_exit(hash_lock);
6784
6785 nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress,
6786 complevel, type);
6787 ASSERT0P(nhdr->b_l1hdr.b_buf);
6788 ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
6789 VERIFY3U(nhdr->b_type, ==, type);
6790 ASSERT(!HDR_SHARED_DATA(nhdr));
6791
6792 nhdr->b_l1hdr.b_buf = buf;
6793 (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
6794 buf->b_hdr = nhdr;
6795
6796 (void) zfs_refcount_add_many(&arc_anon->arcs_size[type],
6797 arc_buf_size(buf), buf);
6798 } else {
6799 ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
6800 /* protected by hash lock, or hdr is on arc_anon */
6801 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
6802 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6803
6804 if (HDR_HAS_L2HDR(hdr)) {
6805 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6806 /* Recheck to prevent race with l2arc_evict(). */
6807 if (HDR_HAS_L2HDR(hdr))
6808 arc_hdr_l2hdr_destroy(hdr);
6809 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6810 }
6811
6812 hdr->b_l1hdr.b_mru_hits = 0;
6813 hdr->b_l1hdr.b_mru_ghost_hits = 0;
6814 hdr->b_l1hdr.b_mfu_hits = 0;
6815 hdr->b_l1hdr.b_mfu_ghost_hits = 0;
6816 arc_change_state(arc_anon, hdr);
6817 hdr->b_l1hdr.b_arc_access = 0;
6818
6819 mutex_exit(hash_lock);
6820 buf_discard_identity(hdr);
6821 arc_buf_thaw(buf);
6822 }
6823 }
6824
6825 int
arc_released(arc_buf_t * buf)6826 arc_released(arc_buf_t *buf)
6827 {
6828 return (buf->b_data != NULL &&
6829 buf->b_hdr->b_l1hdr.b_state == arc_anon);
6830 }
6831
6832 #ifdef ZFS_DEBUG
6833 int
arc_referenced(arc_buf_t * buf)6834 arc_referenced(arc_buf_t *buf)
6835 {
6836 return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
6837 }
6838 #endif
6839
/*
 * zio "ready" callback for arc_write(): records the final bp-derived
 * properties (compression, psize, crypto parameters) on the hdr and
 * populates the hdr's data (b_pabd or b_rabd) from the zio or the buf.
 * May run more than once if the zio is reexecuted after pool suspension.
 */
static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;
	blkptr_t *bp = zio->io_bp;
	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
	fstrans_cookie_t cookie = spl_fstrans_mark();

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);

	/*
	 * If we're reexecuting this zio because the pool suspended, then
	 * cleanup any state that was previously set the first time the
	 * callback was invoked.
	 */
	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
		arc_cksum_free(hdr);
		arc_buf_unwatch(buf);
		if (hdr->b_l1hdr.b_pabd != NULL) {
			if (ARC_BUF_SHARED(buf)) {
				arc_unshare_buf(hdr, buf);
			} else {
				ASSERT(!arc_buf_is_shared(buf));
				arc_hdr_free_abd(hdr, B_FALSE);
			}
		}

		if (HDR_HAS_RABD(hdr))
			arc_hdr_free_abd(hdr, B_TRUE);
	}
	/* Whether first run or reexecute, the hdr must now hold no data. */
	ASSERT0P(hdr->b_l1hdr.b_pabd);
	ASSERT(!HDR_HAS_RABD(hdr));
	ASSERT(!HDR_SHARED_DATA(hdr));
	ASSERT(!arc_buf_is_shared(buf));

	callback->awcb_ready(zio, buf, callback->awcb_private);

	if (HDR_IO_IN_PROGRESS(hdr)) {
		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
	} else {
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
	}

	if (BP_IS_PROTECTED(bp)) {
		/* ZIL blocks are written through zio_rewrite */
		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);

		if (BP_SHOULD_BYTESWAP(bp)) {
			if (BP_GET_LEVEL(bp) > 0) {
				/* Indirect blocks are arrays of blkptrs. */
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
			} else {
				hdr->b_l1hdr.b_byteswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
			}
		} else {
			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		}

		/* Copy the crypto parameters from the bp into the hdr. */
		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
		    hdr->b_crypt_hdr.b_iv);
		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
	} else {
		arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
	}

	/*
	 * If this block was written for raw encryption but the zio layer
	 * ended up only authenticating it, adjust the buffer flags now.
	 */
	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
	}

	/* this must be done after the buffer flags are adjusted */
	arc_cksum_compute(buf);

	enum zio_compress compress;
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
		compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
		compress = BP_GET_COMPRESS(bp);
	}
	HDR_SET_PSIZE(hdr, psize);
	arc_hdr_set_compress(hdr, compress);
	hdr->b_complevel = zio->io_prop.zp_complevel;

	/* Nothing to cache for a failed or zero-size (hole) write. */
	if (zio->io_error != 0 || psize == 0)
		goto out;

	/*
	 * Fill the hdr with data. If the buffer is encrypted we have no choice
	 * but to copy the data into b_radb. If the hdr is compressed, the data
	 * we want is available from the zio, otherwise we can take it from
	 * the buf.
	 *
	 * We might be able to share the buf's data with the hdr here. However,
	 * doing so would cause the ARC to be full of linear ABDs if we write a
	 * lot of shareable data. As a compromise, we check whether scattered
	 * ABDs are allowed, and assume that if they are then the user wants
	 * the ARC to be primarily filled with them regardless of the data being
	 * written. Therefore, if they're allowed then we allocate one and copy
	 * the data into it; otherwise, we share the data directly if we can.
	 */
	if (ARC_BUF_ENCRYPTED(buf)) {
		ASSERT3U(psize, >, 0);
		ASSERT(ARC_BUF_COMPRESSED(buf));
		arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
		    ARC_HDR_USE_RESERVE);
		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
	} else if (!(HDR_UNCACHED(hdr) ||
	    abd_size_alloc_linear(arc_buf_size(buf))) ||
	    !arc_can_share(hdr, buf)) {
		/*
		 * Ideally, we would always copy the io_abd into b_pabd, but the
		 * user may have disabled compressed ARC, thus we must check the
		 * hdr's compression setting rather than the io_bp's.
		 */
		if (BP_IS_ENCRYPTED(bp)) {
			ASSERT3U(psize, >, 0);
			arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
			    ARC_HDR_USE_RESERVE);
			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
		    !ARC_BUF_COMPRESSED(buf)) {
			ASSERT3U(psize, >, 0);
			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
		} else {
			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
			    arc_buf_size(buf));
		}
	} else {
		/* Share buf's data block with the hdr instead of copying. */
		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
		ASSERT(ARC_BUF_LAST(buf));

		arc_share_buf(hdr, buf);
	}

out:
	arc_hdr_verify(hdr, bp);
	spl_fstrans_unmark(cookie);
}
7001
7002 static void
arc_write_children_ready(zio_t * zio)7003 arc_write_children_ready(zio_t *zio)
7004 {
7005 arc_write_callback_t *callback = zio->io_private;
7006 arc_buf_t *buf = callback->awcb_buf;
7007
7008 callback->awcb_children_ready(zio, buf, callback->awcb_private);
7009 }
7010
/*
 * zio "done" callback for arc_write(): records the written block's
 * identity on the hdr, inserts it into the buf hash table (resolving
 * sync-to-convergence, nopwrite, and dedup collisions), drops the
 * IO_IN_PROGRESS state, and invokes the consumer's done callback.
 */
static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT0P(hdr->b_l1hdr.b_acb);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);

		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
			buf_discard_identity(hdr);
		} else {
			/* Adopt the on-disk identity of the written block. */
			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
			hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp);
		}
	} else {
		ASSERT(HDR_EMPTY(hdr));
	}

	/*
	 * If the block to be written was all-zero or compressed enough to be
	 * embedded in the BP, no write was performed so there will be no
	 * dva/birth/checksum. The buffer must therefore remain anonymous
	 * (and uncached).
	 */
	if (!HDR_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		ASSERT0(zio->io_error);

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists != NULL) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad overwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
				ASSERT(zfs_refcount_is_zero(
				    &exists->b_l1hdr.b_refcnt));
				/* Evict the stale hdr, then retry insert. */
				arc_change_state(arc_anon, exists);
				arc_hdr_destroy(exists);
				mutex_exit(hash_lock);
				exists = buf_hash_insert(hdr, &hash_lock);
				ASSERT0P(exists);
			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
				/* nopwrite */
				ASSERT(zio->io_prop.zp_nopwrite);
				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
					panic("bad nopwrite, hdr=%p exists=%p",
					    (void *)hdr, (void *)exists);
			} else {
				/* Dedup */
				ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
				ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
				ASSERT(BP_GET_DEDUP(zio->io_bp));
				ASSERT0(BP_GET_LEVEL(zio->io_bp));
			}
		}
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		VERIFY3S(remove_reference(hdr, hdr), >, 0);
		/* if it's not anon, we are doing a scrub */
		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
			arc_access(hdr, 0, B_FALSE);
		mutex_exit(hash_lock);
	} else {
		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
		VERIFY3S(remove_reference(hdr, hdr), >, 0);
	}

	callback->awcb_done(zio, buf, callback->awcb_private);

	abd_free(zio->io_abd);
	kmem_free(callback, sizeof (arc_write_callback_t));
}
7096
/*
 * Issue a write zio for an anonymous ARC buffer. The hdr's stale data
 * (b_pabd/b_rabd) is released here; arc_write_ready() repopulates it
 * once the block's final properties are known. For raw (encrypted or
 * compressed) bufs the zio properties are derived from the hdr rather
 * than from *zp. Returns the created zio; the caller issues/waits it.
 */
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
    blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
    const zio_prop_t *zp, arc_write_done_func_t *ready,
    arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
    void *private, zio_priority_t priority, int zio_flags,
    const zbookmark_phys_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_write_callback_t *callback;
	zio_t *zio;
	zio_prop_t localprop = *zp;

	ASSERT3P(ready, !=, NULL);
	ASSERT3P(done, !=, NULL);
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT0P(hdr->b_l1hdr.b_acb);
	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
	if (uncached)
		arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
	else if (l2arc)
		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);

	if (ARC_BUF_ENCRYPTED(buf)) {
		/*
		 * Raw encrypted write: take compression and crypto
		 * parameters from the hdr, not the caller's zp.
		 */
		ASSERT(ARC_BUF_COMPRESSED(buf));
		localprop.zp_encrypt = B_TRUE;
		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
		localprop.zp_complevel = hdr->b_complevel;
		localprop.zp_byteorder =
		    (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
		    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
		memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt,
		    ZIO_DATA_SALT_LEN);
		memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv,
		    ZIO_DATA_IV_LEN);
		memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac,
		    ZIO_DATA_MAC_LEN);
		if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
			/* One DVA slot is consumed by the crypto MAC. */
			localprop.zp_nopwrite = B_FALSE;
			localprop.zp_copies =
			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
			localprop.zp_gang_copies =
			    MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
		}
		zio_flags |= ZIO_FLAG_RAW;
	} else if (ARC_BUF_COMPRESSED(buf)) {
		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
		localprop.zp_complevel = hdr->b_complevel;
		zio_flags |= ZIO_FLAG_RAW_COMPRESS;
	}
	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
	callback->awcb_ready = ready;
	callback->awcb_children_ready = children_ready;
	callback->awcb_done = done;
	callback->awcb_private = private;
	callback->awcb_buf = buf;

	/*
	 * The hdr's b_pabd is now stale, free it now. A new data block
	 * will be allocated when the zio pipeline calls arc_write_ready().
	 */
	if (hdr->b_l1hdr.b_pabd != NULL) {
		/*
		 * If the buf is currently sharing the data block with
		 * the hdr then we need to break that relationship here.
		 * The hdr will remain with a NULL data pointer and the
		 * buf will take sole ownership of the block.
		 */
		if (ARC_BUF_SHARED(buf)) {
			arc_unshare_buf(hdr, buf);
		} else {
			ASSERT(!arc_buf_is_shared(buf));
			arc_hdr_free_abd(hdr, B_FALSE);
		}
		VERIFY3P(buf->b_data, !=, NULL);
	}

	if (HDR_HAS_RABD(hdr))
		arc_hdr_free_abd(hdr, B_TRUE);

	if (!(zio_flags & ZIO_FLAG_RAW))
		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);

	ASSERT(!arc_buf_is_shared(buf));
	ASSERT0P(hdr->b_l1hdr.b_pabd);

	zio = zio_write(pio, spa, txg, bp,
	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
	    (children_ready != NULL) ? arc_write_children_ready : NULL,
	    arc_write_done, callback, priority, zio_flags, zb);

	return (zio);
}
7193
7194 void
arc_tempreserve_clear(uint64_t reserve)7195 arc_tempreserve_clear(uint64_t reserve)
7196 {
7197 atomic_add_64(&arc_tempreserve, -reserve);
7198 ASSERT((int64_t)arc_tempreserve >= 0);
7199 }
7200
/*
 * Reserve "reserve" bytes of anticipated dirty data for the given txg,
 * throttling (ERESTART) when the reservation, outstanding reservations,
 * or anonymous dirty data would exceed the ARC's dirty-data limits.
 * On success the reservation is added to arc_tempreserve and must later
 * be returned via arc_tempreserve_clear().
 */
int
arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
{
	int error;
	uint64_t anon_size;

	/* Grow the target ARC size to accommodate unusually large writes. */
	if (!arc_no_grow &&
	    reserve > arc_c/4 &&
	    reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
		arc_c = MIN(arc_c_max, reserve * 4);

	/*
	 * Throttle when the calculated memory footprint for the TXG
	 * exceeds the target ARC size.
	 */
	if (reserve > arc_c) {
		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
		return (SET_ERROR(ERESTART));
	}

	/*
	 * Don't count loaned bufs as in flight dirty data to prevent long
	 * network delays from blocking transactions that are ready to be
	 * assigned to a txg.
	 */

	/* assert that it has not wrapped around */
	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);

	/* Anonymous (not-yet-synced) bytes, excluding loaned buffers. */
	anon_size = MAX((int64_t)
	    (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) +
	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) -
	    arc_loaned_bytes), 0);

	/*
	 * Writes will, almost always, require additional memory allocations
	 * in order to compress/encrypt/etc the data. We therefore need to
	 * make sure that there is sufficient available memory for this.
	 */
	error = arc_memory_throttle(spa, reserve, txg);
	if (error != 0)
		return (error);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large. We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 *
	 * In the case of one pool being built on another pool, we want
	 * to make sure we don't end up throttling the lower (backing)
	 * pool when the upper pool is the majority contributor to dirty
	 * data. To insure we make forward progress during throttling, we
	 * also check the current pool's net dirty data and only throttle
	 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
	 * data in the cache.
	 *
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail. Not a huge deal.
	 */
	uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
	uint64_t spa_dirty_anon = spa_dirty_data(spa);
	uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
	if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
	    anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
	    spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
#ifdef ZFS_DEBUG
		uint64_t meta_esize = zfs_refcount_count(
		    &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
		uint64_t data_esize =
		    zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
		    (u_longlong_t)arc_tempreserve >> 10,
		    (u_longlong_t)meta_esize >> 10,
		    (u_longlong_t)data_esize >> 10,
		    (u_longlong_t)reserve >> 10,
		    (u_longlong_t)rarc_c >> 10);
#endif
		DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
		return (SET_ERROR(ERESTART));
	}
	atomic_add_64(&arc_tempreserve, reserve);
	return (0);
}
7285
7286 static void
arc_kstat_update_state(arc_state_t * state,kstat_named_t * size,kstat_named_t * data,kstat_named_t * metadata,kstat_named_t * evict_data,kstat_named_t * evict_metadata)7287 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
7288 kstat_named_t *data, kstat_named_t *metadata,
7289 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
7290 {
7291 data->value.ui64 =
7292 zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]);
7293 metadata->value.ui64 =
7294 zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
7295 size->value.ui64 = data->value.ui64 + metadata->value.ui64;
7296 evict_data->value.ui64 =
7297 zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
7298 evict_metadata->value.ui64 =
7299 zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
7300 }
7301
/*
 * kstat update callback for the ARC statistics kstat. Writes are
 * rejected with EACCES; on read, every wmsum/aggsum accumulator in
 * arc_sums is snapshotted into the corresponding named kstat value,
 * followed by the per-state size breakdowns and current system memory
 * figures. Always returns 0 on the read path.
 */
static int
arc_kstat_update(kstat_t *ksp, int rw)
{
	arc_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (SET_ERROR(EACCES));

	as->arcstat_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_hits);
	as->arcstat_iohits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_iohits);
	as->arcstat_misses.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_misses);
	as->arcstat_demand_data_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_data_hits);
	as->arcstat_demand_data_iohits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_data_iohits);
	as->arcstat_demand_data_misses.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_data_misses);
	as->arcstat_demand_metadata_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_metadata_hits);
	as->arcstat_demand_metadata_iohits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_metadata_iohits);
	as->arcstat_demand_metadata_misses.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_metadata_misses);
	as->arcstat_prefetch_data_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prefetch_data_hits);
	as->arcstat_prefetch_data_iohits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prefetch_data_iohits);
	as->arcstat_prefetch_data_misses.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prefetch_data_misses);
	as->arcstat_prefetch_metadata_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits);
	as->arcstat_prefetch_metadata_iohits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits);
	as->arcstat_prefetch_metadata_misses.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses);
	as->arcstat_mru_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_mru_hits);
	as->arcstat_mru_ghost_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_mru_ghost_hits);
	as->arcstat_mfu_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_mfu_hits);
	as->arcstat_mfu_ghost_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_mfu_ghost_hits);
	as->arcstat_uncached_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_uncached_hits);
	as->arcstat_deleted.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_deleted);
	as->arcstat_mutex_miss.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_mutex_miss);
	as->arcstat_access_skip.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_access_skip);
	as->arcstat_evict_skip.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_skip);
	as->arcstat_evict_not_enough.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_not_enough);
	as->arcstat_evict_l2_cached.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_l2_cached);
	as->arcstat_evict_l2_eligible.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible);
	as->arcstat_evict_l2_eligible_mfu.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu);
	as->arcstat_evict_l2_eligible_mru.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru);
	as->arcstat_evict_l2_ineligible.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_l2_ineligible);
	as->arcstat_evict_l2_skip.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_evict_l2_skip);
	/*
	 * Both the current and the "max" hash-element stats report the
	 * same counter; no separate maximum is tracked here.
	 */
	as->arcstat_hash_elements.value.ui64 =
	    as->arcstat_hash_elements_max.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_hash_elements);
	as->arcstat_hash_collisions.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_hash_collisions);
	as->arcstat_hash_chains.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_hash_chains);
	as->arcstat_size.value.ui64 =
	    aggsum_value(&arc_sums.arcstat_size);
	as->arcstat_compressed_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_compressed_size);
	as->arcstat_uncompressed_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_uncompressed_size);
	as->arcstat_overhead_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_overhead_size);
	as->arcstat_hdr_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_hdr_size);
	as->arcstat_data_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_data_size);
	as->arcstat_metadata_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_metadata_size);
	as->arcstat_dbuf_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_dbuf_size);
#if defined(COMPAT_FREEBSD11)
	/* Legacy FreeBSD 11 "other_size": bonus + dnode + dbuf sizes. */
	as->arcstat_other_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_bonus_size) +
	    aggsum_value(&arc_sums.arcstat_dnode_size) +
	    wmsum_value(&arc_sums.arcstat_dbuf_size);
#endif

	/* Per-state size/evictable breakdowns. */
	arc_kstat_update_state(arc_anon,
	    &as->arcstat_anon_size,
	    &as->arcstat_anon_data,
	    &as->arcstat_anon_metadata,
	    &as->arcstat_anon_evictable_data,
	    &as->arcstat_anon_evictable_metadata);
	arc_kstat_update_state(arc_mru,
	    &as->arcstat_mru_size,
	    &as->arcstat_mru_data,
	    &as->arcstat_mru_metadata,
	    &as->arcstat_mru_evictable_data,
	    &as->arcstat_mru_evictable_metadata);
	arc_kstat_update_state(arc_mru_ghost,
	    &as->arcstat_mru_ghost_size,
	    &as->arcstat_mru_ghost_data,
	    &as->arcstat_mru_ghost_metadata,
	    &as->arcstat_mru_ghost_evictable_data,
	    &as->arcstat_mru_ghost_evictable_metadata);
	arc_kstat_update_state(arc_mfu,
	    &as->arcstat_mfu_size,
	    &as->arcstat_mfu_data,
	    &as->arcstat_mfu_metadata,
	    &as->arcstat_mfu_evictable_data,
	    &as->arcstat_mfu_evictable_metadata);
	arc_kstat_update_state(arc_mfu_ghost,
	    &as->arcstat_mfu_ghost_size,
	    &as->arcstat_mfu_ghost_data,
	    &as->arcstat_mfu_ghost_metadata,
	    &as->arcstat_mfu_ghost_evictable_data,
	    &as->arcstat_mfu_ghost_evictable_metadata);
	arc_kstat_update_state(arc_uncached,
	    &as->arcstat_uncached_size,
	    &as->arcstat_uncached_data,
	    &as->arcstat_uncached_metadata,
	    &as->arcstat_uncached_evictable_data,
	    &as->arcstat_uncached_evictable_metadata);

	as->arcstat_dnode_size.value.ui64 =
	    aggsum_value(&arc_sums.arcstat_dnode_size);
	as->arcstat_bonus_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_bonus_size);
	as->arcstat_l2_hits.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_hits);
	as->arcstat_l2_misses.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_misses);
	as->arcstat_l2_prefetch_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_prefetch_asize);
	as->arcstat_l2_mru_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_mru_asize);
	as->arcstat_l2_mfu_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_mfu_asize);
	as->arcstat_l2_bufc_data_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize);
	as->arcstat_l2_bufc_metadata_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize);
	as->arcstat_l2_feeds.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_feeds);
	as->arcstat_l2_rw_clash.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rw_clash);
	as->arcstat_l2_read_bytes.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_read_bytes);
	as->arcstat_l2_write_bytes.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_write_bytes);
	as->arcstat_l2_writes_sent.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_writes_sent);
	as->arcstat_l2_writes_done.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_writes_done);
	as->arcstat_l2_writes_error.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_writes_error);
	as->arcstat_l2_writes_lock_retry.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry);
	as->arcstat_l2_evict_lock_retry.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry);
	as->arcstat_l2_evict_reading.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_evict_reading);
	as->arcstat_l2_evict_l1cached.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_evict_l1cached);
	as->arcstat_l2_free_on_write.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_free_on_write);
	as->arcstat_l2_abort_lowmem.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_abort_lowmem);
	as->arcstat_l2_cksum_bad.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_cksum_bad);
	as->arcstat_l2_io_error.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_io_error);
	as->arcstat_l2_lsize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_lsize);
	as->arcstat_l2_psize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_psize);
	as->arcstat_l2_hdr_size.value.ui64 =
	    aggsum_value(&arc_sums.arcstat_l2_hdr_size);
	as->arcstat_l2_log_blk_writes.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_log_blk_writes);
	as->arcstat_l2_log_blk_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_log_blk_asize);
	as->arcstat_l2_log_blk_count.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_log_blk_count);
	as->arcstat_l2_rebuild_success.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_success);
	as->arcstat_l2_rebuild_abort_unsupported.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
	as->arcstat_l2_rebuild_abort_io_errors.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
	as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
	as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
	as->arcstat_l2_rebuild_abort_lowmem.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
	as->arcstat_l2_rebuild_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_size);
	as->arcstat_l2_rebuild_asize.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_asize);
	as->arcstat_l2_rebuild_bufs.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs);
	as->arcstat_l2_rebuild_bufs_precached.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached);
	as->arcstat_l2_rebuild_log_blks.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks);
	as->arcstat_memory_throttle_count.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_memory_throttle_count);
	as->arcstat_memory_direct_count.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_memory_direct_count);
	as->arcstat_memory_indirect_count.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_memory_indirect_count);

	/* Current system memory snapshot, queried at read time. */
	as->arcstat_memory_all_bytes.value.ui64 =
	    arc_all_memory();
	as->arcstat_memory_free_bytes.value.ui64 =
	    arc_free_memory();
	as->arcstat_memory_available_bytes.value.i64 =
	    arc_available_memory();

	as->arcstat_prune.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prune);
	as->arcstat_meta_used.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_meta_used);
	as->arcstat_async_upgrade_sync.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_async_upgrade_sync);
	as->arcstat_predictive_prefetch.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_predictive_prefetch);
	as->arcstat_demand_hit_predictive_prefetch.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch);
	as->arcstat_demand_iohit_predictive_prefetch.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
	as->arcstat_prescient_prefetch.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_prescient_prefetch);
	as->arcstat_demand_hit_prescient_prefetch.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch);
	as->arcstat_demand_iohit_prescient_prefetch.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
	as->arcstat_raw_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_raw_size);
	as->arcstat_cached_only_in_progress.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_cached_only_in_progress);
	as->arcstat_abd_chunk_waste_size.value.ui64 =
	    wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size);

	return (0);
}
7562
7563 /*
7564 * This function *must* return indices evenly distributed between all
7565 * sublists of the multilist. This is needed due to how the ARC eviction
7566 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
7567 * distributed between all sublists and uses this assumption when
7568 * deciding which sublist to evict from and how much to evict from it.
7569 */
7570 static unsigned int
arc_state_multilist_index_func(multilist_t * ml,void * obj)7571 arc_state_multilist_index_func(multilist_t *ml, void *obj)
7572 {
7573 arc_buf_hdr_t *hdr = obj;
7574
7575 /*
7576 * We rely on b_dva to generate evenly distributed index
7577 * numbers using buf_hash below. So, as an added precaution,
7578 * let's make sure we never add empty buffers to the arc lists.
7579 */
7580 ASSERT(!HDR_EMPTY(hdr));
7581
7582 /*
7583 * The assumption here, is the hash value for a given
7584 * arc_buf_hdr_t will remain constant throughout its lifetime
7585 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
7586 * Thus, we don't need to store the header's sublist index
7587 * on insertion, as this index can be recalculated on removal.
7588 *
7589 * Also, the low order bits of the hash value are thought to be
7590 * distributed evenly. Otherwise, in the case that the multilist
7591 * has a power of two number of sublists, each sublists' usage
7592 * would not be evenly distributed. In this context full 64bit
7593 * division would be a waste of time, so limit it to 32 bits.
7594 */
7595 return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
7596 multilist_get_num_sublists(ml));
7597 }
7598
/*
 * Sublist index callback for the arc_l2c_only state. L2-only headers
 * have no L1 header allocated (see the note in arc_state_init()), so
 * they must never be inserted into this state's multilists; panic if
 * an insertion is ever attempted.
 */
static unsigned int
arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
{
	panic("Header %p insert into arc_l2c_only %p", obj, ml);
}
7604
/*
 * If do_warn is set and the tunable was given a non-zero value that
 * differs from the value actually in effect, log a warning that the
 * requested tuning was ignored and report the value being used instead.
 */
#define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {	\
	if ((do_warn) && (tuning) && ((tuning) != (value))) {	\
		cmn_err(CE_WARN,				\
		    "ignoring tunable %s (using %llu instead)",	\
		    (#tuning), (u_longlong_t)(value));		\
	}							\
} while (0)
7612
7613 /*
7614 * Called during module initialization and periodically thereafter to
7615 * apply reasonable changes to the exposed performance tunings. Can also be
7616 * called explicitly by param_set_arc_*() functions when ARC tunables are
7617 * updated manually. Non-zero zfs_* values which differ from the currently set
7618 * values will be applied.
7619 */
7620 void
arc_tuning_update(boolean_t verbose)7621 arc_tuning_update(boolean_t verbose)
7622 {
7623 uint64_t allmem = arc_all_memory();
7624
7625 /* Valid range: 32M - <arc_c_max> */
7626 if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
7627 (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
7628 (zfs_arc_min <= arc_c_max)) {
7629 arc_c_min = zfs_arc_min;
7630 arc_c = MAX(arc_c, arc_c_min);
7631 }
7632 WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
7633
7634 /* Valid range: 64M - <all physical memory> */
7635 if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
7636 (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
7637 (zfs_arc_max > arc_c_min)) {
7638 arc_c_max = zfs_arc_max;
7639 arc_c = MIN(arc_c, arc_c_max);
7640 if (arc_dnode_limit > arc_c_max)
7641 arc_dnode_limit = arc_c_max;
7642 }
7643 WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
7644
7645 /* Valid range: 0 - <all physical memory> */
7646 arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
7647 MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100;
7648 WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose);
7649
7650 /* Valid range: 1 - N */
7651 if (zfs_arc_grow_retry)
7652 arc_grow_retry = zfs_arc_grow_retry;
7653
7654 /* Valid range: 1 - N */
7655 if (zfs_arc_shrink_shift) {
7656 arc_shrink_shift = zfs_arc_shrink_shift;
7657 arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
7658 }
7659
7660 /* Valid range: 1 - N ms */
7661 if (zfs_arc_min_prefetch_ms)
7662 arc_min_prefetch = MSEC_TO_TICK(zfs_arc_min_prefetch_ms);
7663
7664 /* Valid range: 1 - N ms */
7665 if (zfs_arc_min_prescient_prefetch_ms) {
7666 arc_min_prescient_prefetch =
7667 MSEC_TO_TICK(zfs_arc_min_prescient_prefetch_ms);
7668 }
7669
7670 /* Valid range: 0 - 100 */
7671 if (zfs_arc_lotsfree_percent <= 100)
7672 arc_lotsfree_percent = zfs_arc_lotsfree_percent;
7673 WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
7674 verbose);
7675
7676 /* Valid range: 0 - <all physical memory> */
7677 if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
7678 arc_sys_free = MIN(zfs_arc_sys_free, allmem);
7679 WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
7680 }
7681
7682 static void
arc_state_multilist_init(multilist_t * ml,multilist_sublist_index_func_t * index_func,int * maxcountp)7683 arc_state_multilist_init(multilist_t *ml,
7684 multilist_sublist_index_func_t *index_func, int *maxcountp)
7685 {
7686 multilist_create(ml, sizeof (arc_buf_hdr_t),
7687 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func);
7688 *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml));
7689 }
7690
/*
 * Initialize all global ARC state: the per-state eviction multilists,
 * the per-state size and evictable-size refcounts, the ghost-state hit
 * counters, and every statistics accumulator in arc_sums. Everything
 * created here is torn down by arc_state_fini().
 */
static void
arc_state_init(void)
{
	int num_sublists = 0;

	/*
	 * Create the eviction multilists for every state that can hold
	 * evictable buffers, tracking the largest sublist count seen.
	 */
	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
	    arc_state_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
	    arc_state_multilist_index_func, &num_sublists);

	/*
	 * L2 headers should never be on the L2 state list since they don't
	 * have L1 headers allocated. Special index function asserts that.
	 */
	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
	    arc_state_l2c_multilist_index_func, &num_sublists);
	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
	    arc_state_l2c_multilist_index_func, &num_sublists);

	/*
	 * Keep track of the number of markers needed to reclaim buffers from
	 * any ARC state. The markers will be pre-allocated so as to minimize
	 * the number of memory allocations performed by the eviction thread.
	 */
	arc_state_evict_marker_count = num_sublists;

	/* Evictable-size refcounts, one per state and buffer type. */
	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);

	/* Total-size refcounts, one per state and buffer type. */
	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);

	/* Ghost-state hit counters, used by the adaptive replacement logic. */
	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0);
	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0);
	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0);

	/* Statistics accumulators, snapshotted by arc_kstat_update(). */
	wmsum_init(&arc_sums.arcstat_hits, 0);
	wmsum_init(&arc_sums.arcstat_iohits, 0);
	wmsum_init(&arc_sums.arcstat_misses, 0);
	wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
	wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0);
	wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
	wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
	wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0);
	wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
	wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
	wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0);
	wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
	wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
	wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0);
	wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
	wmsum_init(&arc_sums.arcstat_mru_hits, 0);
	wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
	wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
	wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
	wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
	wmsum_init(&arc_sums.arcstat_deleted, 0);
	wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
	wmsum_init(&arc_sums.arcstat_access_skip, 0);
	wmsum_init(&arc_sums.arcstat_evict_skip, 0);
	wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
	wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
	wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
	wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
	wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
	wmsum_init(&arc_sums.arcstat_hash_elements, 0);
	wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
	wmsum_init(&arc_sums.arcstat_hash_chains, 0);
	aggsum_init(&arc_sums.arcstat_size, 0);
	wmsum_init(&arc_sums.arcstat_compressed_size, 0);
	wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
	wmsum_init(&arc_sums.arcstat_overhead_size, 0);
	wmsum_init(&arc_sums.arcstat_hdr_size, 0);
	wmsum_init(&arc_sums.arcstat_data_size, 0);
	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
	wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
	wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
	wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
	wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
	wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
	wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
	wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
	wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
	wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
	wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
	wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
	wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
	wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
	wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
	wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
	wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
	wmsum_init(&arc_sums.arcstat_l2_psize, 0);
	aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
	wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
	wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
	wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
	wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
	wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
	wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
	wmsum_init(&arc_sums.arcstat_prune, 0);
	wmsum_init(&arc_sums.arcstat_meta_used, 0);
	wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
	wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0);
	wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
	wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0);
	wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0);
	wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
	wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0);
	wmsum_init(&arc_sums.arcstat_raw_size, 0);
	wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
	wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);

	/* Tag each state structure with its identity. */
	arc_anon->arcs_state = ARC_STATE_ANON;
	arc_mru->arcs_state = ARC_STATE_MRU;
	arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
	arc_mfu->arcs_state = ARC_STATE_MFU;
	arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
	arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
	arc_uncached->arcs_state = ARC_STATE_UNCACHED;
}
7875
7876 static void
arc_state_fini(void)7877 arc_state_fini(void)
7878 {
7879 zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7880 zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7881 zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7882 zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7883 zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7884 zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7885 zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7886 zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7887 zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7888 zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7889 zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7890 zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7891 zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
7892 zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
7893
7894 zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]);
7895 zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
7896 zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]);
7897 zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
7898 zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
7899 zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
7900 zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
7901 zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
7902 zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
7903 zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
7904 zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
7905 zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
7906 zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
7907 zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
7908
7909 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
7910 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
7911 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
7912 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
7913 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
7914 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
7915 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
7916 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7917 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
7918 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
7919 multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
7920 multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
7921
7922 wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
7923 wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
7924 wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
7925 wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
7926
7927 wmsum_fini(&arc_sums.arcstat_hits);
7928 wmsum_fini(&arc_sums.arcstat_iohits);
7929 wmsum_fini(&arc_sums.arcstat_misses);
7930 wmsum_fini(&arc_sums.arcstat_demand_data_hits);
7931 wmsum_fini(&arc_sums.arcstat_demand_data_iohits);
7932 wmsum_fini(&arc_sums.arcstat_demand_data_misses);
7933 wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
7934 wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits);
7935 wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
7936 wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
7937 wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits);
7938 wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
7939 wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
7940 wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits);
7941 wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
7942 wmsum_fini(&arc_sums.arcstat_mru_hits);
7943 wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
7944 wmsum_fini(&arc_sums.arcstat_mfu_hits);
7945 wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
7946 wmsum_fini(&arc_sums.arcstat_uncached_hits);
7947 wmsum_fini(&arc_sums.arcstat_deleted);
7948 wmsum_fini(&arc_sums.arcstat_mutex_miss);
7949 wmsum_fini(&arc_sums.arcstat_access_skip);
7950 wmsum_fini(&arc_sums.arcstat_evict_skip);
7951 wmsum_fini(&arc_sums.arcstat_evict_not_enough);
7952 wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
7953 wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
7954 wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
7955 wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
7956 wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
7957 wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
7958 wmsum_fini(&arc_sums.arcstat_hash_elements);
7959 wmsum_fini(&arc_sums.arcstat_hash_collisions);
7960 wmsum_fini(&arc_sums.arcstat_hash_chains);
7961 aggsum_fini(&arc_sums.arcstat_size);
7962 wmsum_fini(&arc_sums.arcstat_compressed_size);
7963 wmsum_fini(&arc_sums.arcstat_uncompressed_size);
7964 wmsum_fini(&arc_sums.arcstat_overhead_size);
7965 wmsum_fini(&arc_sums.arcstat_hdr_size);
7966 wmsum_fini(&arc_sums.arcstat_data_size);
7967 wmsum_fini(&arc_sums.arcstat_metadata_size);
7968 wmsum_fini(&arc_sums.arcstat_dbuf_size);
7969 aggsum_fini(&arc_sums.arcstat_dnode_size);
7970 wmsum_fini(&arc_sums.arcstat_bonus_size);
7971 wmsum_fini(&arc_sums.arcstat_l2_hits);
7972 wmsum_fini(&arc_sums.arcstat_l2_misses);
7973 wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
7974 wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
7975 wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
7976 wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
7977 wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
7978 wmsum_fini(&arc_sums.arcstat_l2_feeds);
7979 wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
7980 wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
7981 wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
7982 wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
7983 wmsum_fini(&arc_sums.arcstat_l2_writes_done);
7984 wmsum_fini(&arc_sums.arcstat_l2_writes_error);
7985 wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
7986 wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
7987 wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
7988 wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
7989 wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
7990 wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
7991 wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
7992 wmsum_fini(&arc_sums.arcstat_l2_io_error);
7993 wmsum_fini(&arc_sums.arcstat_l2_lsize);
7994 wmsum_fini(&arc_sums.arcstat_l2_psize);
7995 aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
7996 wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
7997 wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
7998 wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
7999 wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
8000 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
8001 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
8002 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
8003 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
8004 wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
8005 wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
8006 wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
8007 wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
8008 wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
8009 wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
8010 wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
8011 wmsum_fini(&arc_sums.arcstat_memory_direct_count);
8012 wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
8013 wmsum_fini(&arc_sums.arcstat_prune);
8014 wmsum_fini(&arc_sums.arcstat_meta_used);
8015 wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
8016 wmsum_fini(&arc_sums.arcstat_predictive_prefetch);
8017 wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
8018 wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
8019 wmsum_fini(&arc_sums.arcstat_prescient_prefetch);
8020 wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
8021 wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
8022 wmsum_fini(&arc_sums.arcstat_raw_size);
8023 wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
8024 wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
8025 }
8026
8027 uint64_t
arc_target_bytes(void)8028 arc_target_bytes(void)
8029 {
8030 return (arc_c);
8031 }
8032
8033 void
arc_set_limits(uint64_t allmem)8034 arc_set_limits(uint64_t allmem)
8035 {
8036 /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
8037 arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
8038
8039 /* How to set default max varies by platform. */
8040 arc_c_max = arc_default_max(arc_c_min, allmem);
8041 }
8042
/*
 * One-time initialization of the ARC: size limits, internal state and
 * lists, kstats, taskqs, and the background eviction/reap threads.
 * Called once at module load, before any pools are imported.
 */
void
arc_init(void)
{
	uint64_t percent, allmem = arc_all_memory();
	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
	    offsetof(arc_evict_waiter_t, aew_node));

	/*
	 * Prefetch grace periods, converted to clock ticks: 1s for normal
	 * prefetches, 6s for prescient ones (presumably the minimum time a
	 * prefetched header is protected — confirm against users of these).
	 */
	arc_min_prefetch = MSEC_TO_TICK(1000);
	arc_min_prescient_prefetch = MSEC_TO_TICK(6000);

#if defined(_KERNEL)
	arc_lowmem_init();
#endif

	/* Compute the default arc_c_min/arc_c_max from physical memory. */
	arc_set_limits(allmem);

#ifdef _KERNEL
	/*
	 * If zfs_arc_max is non-zero at init, meaning it was set in the kernel
	 * environment before the module was loaded, don't block setting the
	 * maximum because it is less than arc_c_min, instead, reset arc_c_min
	 * to a lower value.
	 * zfs_arc_min will be handled by arc_tuning_update().
	 */
	if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
	    zfs_arc_max < allmem) {
		arc_c_max = zfs_arc_max;
		if (arc_c_min >= arc_c_max) {
			arc_c_min = MAX(zfs_arc_max / 2,
			    2ULL << SPA_MAXBLOCKSHIFT);
		}
	}
#else
	/*
	 * In userland, there's only the memory pressure that we artificially
	 * create (see arc_available_memory()). Don't let arc_c get too
	 * small, because it can cause transactions to be larger than
	 * arc_c, causing arc_tempreserve_space() to fail.
	 */
	arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
#endif

	/* Begin with the minimum allowed target size. */
	arc_c = arc_c_min;
	/*
	 * 32-bit fixed point fractions of metadata from total ARC size,
	 * MRU data from all data and MRU metadata from all metadata.
	 */
	arc_meta = (1ULL << 32) / 4;	/* Metadata is 25% of arc_c. */
	arc_pd = (1ULL << 32) / 2;	/* Data MRU is 50% of data. */
	arc_pm = (1ULL << 32) / 2;	/* Metadata MRU is 50% of metadata. */

	/* Cap dnode usage at a percentage (clamped to 100) of arc_c_max. */
	percent = MIN(zfs_arc_dnode_limit_percent, 100);
	arc_dnode_limit = arc_c_max * percent / 100;

	/* Apply user specified tunings */
	arc_tuning_update(B_TRUE);

	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	arc_register_hotplug();

	arc_state_init();

	buf_init();

	/* Registered prune callbacks, run via arc_prune_taskq. */
	list_create(&arc_prune_list, sizeof (arc_prune_t),
	    offsetof(arc_prune_t, p_node));
	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);

	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

	arc_evict_thread_init();

	/* State for asynchronous (backgrounded) ARC flushes. */
	list_create(&arc_async_flush_list, sizeof (arc_async_flush_t),
	    offsetof(arc_async_flush_t, af_node));
	mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_flush_taskq = taskq_create("arc_flush", MIN(boot_ncpus, 4),
	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);

	/* Export arcstats; kstat_create() may fail, so check before use. */
	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

	if (arc_ksp != NULL) {
		arc_ksp->ks_data = &arc_stats;
		arc_ksp->ks_update = arc_kstat_update;
		kstat_install(arc_ksp);
	}

	/* Start the eviction and memory-reclaim threads (1s timers). */
	arc_state_evict_markers =
	    arc_state_alloc_markers(arc_state_evict_marker_count);
	arc_evict_zthr = zthr_create_timer("arc_evict",
	    arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
	arc_reap_zthr = zthr_create_timer("arc_reap",
	    arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);

	arc_warm = B_FALSE;

	/*
	 * Calculate maximum amount of dirty data per pool.
	 *
	 * If it has been set by a module parameter, take that.
	 * Otherwise, use a percentage of physical memory defined by
	 * zfs_dirty_data_max_percent (default 10%) with a cap at
	 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
	 */
#ifdef __LP64__
	if (zfs_dirty_data_max_max == 0)
		zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
		    allmem * zfs_dirty_data_max_max_percent / 100);
#else
	/* On 32-bit systems the hard cap is 1G instead of 4G. */
	if (zfs_dirty_data_max_max == 0)
		zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
		    allmem * zfs_dirty_data_max_max_percent / 100);
#endif

	if (zfs_dirty_data_max == 0) {
		zfs_dirty_data_max = allmem *
		    zfs_dirty_data_max_percent / 100;
		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
		    zfs_dirty_data_max_max);
	}

	if (zfs_wrlog_data_max == 0) {

		/*
		 * dp_wrlog_total is reduced for each txg at the end of
		 * spa_sync(). However, dp_dirty_total is reduced every time
		 * a block is written out. Thus under normal operation,
		 * dp_wrlog_total could grow 2 times as big as
		 * zfs_dirty_data_max.
		 */
		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
	}
}
8183
/*
 * Tear down all ARC state created by arc_init().  Called once at module
 * unload.  The ordering below matters: flushes and taskqs are drained
 * before the structures they reference are destroyed.
 */
void
arc_fini(void)
{
	arc_prune_t *p;

#ifdef _KERNEL
	arc_lowmem_fini();
#endif /* _KERNEL */

	/* Wait for any background flushes */
	taskq_wait(arc_flush_taskq);
	taskq_destroy(arc_flush_taskq);

	/* Use B_TRUE to ensure *all* buffers are evicted */
	arc_flush(NULL, B_TRUE);

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	taskq_wait(arc_prune_taskq);
	taskq_destroy(arc_prune_taskq);

	list_destroy(&arc_async_flush_list);
	mutex_destroy(&arc_async_flush_lock);

	/* Drop and free any prune callbacks still registered. */
	mutex_enter(&arc_prune_mtx);
	while ((p = list_remove_head(&arc_prune_list)) != NULL) {
		(void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
		zfs_refcount_destroy(&p->p_refcnt);
		kmem_free(p, sizeof (*p));
	}
	mutex_exit(&arc_prune_mtx);

	list_destroy(&arc_prune_list);
	mutex_destroy(&arc_prune_mtx);

	/* The eviction taskq is only created when multiple threads are used. */
	if (arc_evict_taskq != NULL)
		taskq_wait(arc_evict_taskq);

	(void) zthr_cancel(arc_evict_zthr);
	(void) zthr_cancel(arc_reap_zthr);
	arc_state_free_markers(arc_state_evict_markers,
	    arc_state_evict_marker_count);

	if (arc_evict_taskq != NULL) {
		taskq_destroy(arc_evict_taskq);
		kmem_free(arc_evict_arg,
		    sizeof (evict_arg_t) * zfs_arc_evict_threads);
	}

	mutex_destroy(&arc_evict_lock);
	list_destroy(&arc_evict_waiters);

	/*
	 * Free any buffers that were tagged for destruction.  This needs
	 * to occur before arc_state_fini() runs and destroys the aggsum
	 * values which are updated when freeing scatter ABDs.
	 * Pass NULL to free all ABDs regardless of device.
	 */
	l2arc_do_free_on_write(NULL);

	/*
	 * buf_fini() must precede arc_state_fini() because buf_fini() may
	 * trigger the release of kmem magazines, which can callback to
	 * arc_space_return() which accesses aggsums freed in
	 * arc_state_fini().
	 */
	buf_fini();
	arc_state_fini();

	arc_unregister_hotplug();

	/*
	 * We destroy the zthrs after all the ARC state has been
	 * torn down to avoid the case of them receiving any
	 * wakeup() signals after they are destroyed.
	 */
	zthr_destroy(arc_evict_zthr);
	zthr_destroy(arc_reap_zthr);

	/* Every loaned buffer must have been returned before unload. */
	ASSERT0(arc_loaned_bytes);
}
8267
8268 /*
8269 * Level 2 ARC
8270 *
8271 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
8272 * It uses dedicated storage devices to hold cached data, which are populated
8273 * using large infrequent writes. The main role of this cache is to boost
8274 * the performance of random read workloads. The intended L2ARC devices
8275 * include short-stroked disks, solid state disks, and other media with
8276 * substantially faster read latency than disk.
8277 *
8278 * +-----------------------+
8279 * | ARC |
8280 * +-----------------------+
8281 * | ^ ^
8282 * | | |
8283 * l2arc_feed_thread() arc_read()
8284 * | | |
8285 * | l2arc read |
8286 * V | |
8287 * +---------------+ |
8288 * | L2ARC | |
8289 * +---------------+ |
8290 * | ^ |
8291 * l2arc_write() | |
8292 * | | |
8293 * V | |
8294 * +-------+ +-------+
8295 * | vdev | | vdev |
8296 * | cache | | cache |
8297 * +-------+ +-------+
8298 * +=========+ .-----.
8299 * : L2ARC : |-_____-|
8300 * : devices : | Disks |
8301 * +=========+ `-_____-'
8302 *
8303 * Read requests are satisfied from the following sources, in order:
8304 *
8305 * 1) ARC
8306 * 2) vdev cache of L2ARC devices
8307 * 3) L2ARC devices
8308 * 4) vdev cache of disks
8309 * 5) disks
8310 *
8311 * Some L2ARC device types exhibit extremely slow write performance.
8312 * To accommodate for this there are some significant differences between
8313 * the L2ARC and traditional cache design:
8314 *
8315 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
8316 * the ARC behave as usual, freeing buffers and placing headers on ghost
8317 * lists. The ARC does not send buffers to the L2ARC during eviction as
8318 * this would add inflated write latencies for all ARC memory pressure.
8319 *
8320 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
8321 * It does this by periodically scanning buffers from the eviction-end of
8322 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
8323 * not already there. It scans until a headroom of buffers is satisfied,
8324 * which itself is a buffer for ARC eviction. If a compressible buffer is
8325 * found during scanning and selected for writing to an L2ARC device, we
8326 * temporarily boost scanning headroom during the next scan cycle to make
8327 * sure we adapt to compression effects (which might significantly reduce
8328 * the data volume we write to L2ARC). The thread that does this is
8329 * l2arc_feed_thread(), illustrated below; example sizes are included to
8330 * provide a better sense of ratio than this diagram:
8331 *
8332 * head --> tail
8333 * +---------------------+----------+
8334 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
8335 * +---------------------+----------+ | o L2ARC eligible
8336 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
8337 * +---------------------+----------+ |
8338 * 15.9 Gbytes ^ 32 Mbytes |
8339 * headroom |
8340 * l2arc_feed_thread()
8341 * |
8342 * l2arc write hand <--[oooo]--'
8343 * | 8 Mbyte
8344 * | write max
8345 * V
8346 * +==============================+
8347 * L2ARC dev |####|#|###|###| |####| ... |
8348 * +==============================+
8349 * 32 Gbytes
8350 *
8351 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
8352 * evicted, then the L2ARC has cached a buffer much sooner than it probably
8353 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
8354 * safe to say that this is an uncommon case, since buffers at the end of
8355 * the ARC lists have moved there due to inactivity.
8356 *
8357 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
8358 * then the L2ARC simply misses copying some buffers. This serves as a
8359 * pressure valve to prevent heavy read workloads from both stalling the ARC
8360 * with waits and clogging the L2ARC with writes. This also helps prevent
8361 * the potential for the L2ARC to churn if it attempts to cache content too
8362 * quickly, such as during backups of the entire pool.
8363 *
8364 * 5. After system boot and before the ARC has filled main memory, there are
8365 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
8366 * lists can remain mostly static. Instead of searching from tail of these
8367 * lists as pictured, the l2arc_feed_thread() will search from the list heads
8368 * for eligible buffers, greatly increasing its chance of finding them.
8369 *
8370 * The L2ARC device write speed is also boosted during this time so that
8371 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
8372 * there are no L2ARC reads, and no fear of degrading read performance
8373 * through increased writes.
8374 *
8375 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
8376 * the vdev queue can aggregate them into larger and fewer writes. Each
8377 * device is written to in a rotor fashion, sweeping writes through
8378 * available space then repeating.
8379 *
8380 * 7. The L2ARC does not store dirty content. It never needs to flush
8381 * write buffers back to disk based storage.
8382 *
8383 * 8. If an ARC buffer is written (and dirtied) which also exists in the
8384 * L2ARC, the now stale L2ARC buffer is immediately dropped.
8385 *
8386 * The performance of the L2ARC can be tweaked by a number of tunables, which
8387 * may be necessary for different workloads:
8388 *
8389 * l2arc_write_max max write bytes per interval
8390 * l2arc_dwpd_limit device write endurance limit (100 = 1.0 DWPD)
8391 * l2arc_noprefetch skip caching prefetched buffers
8392 * l2arc_headroom number of max device writes to precache
8393 * l2arc_headroom_boost when we find compressed buffers during ARC
8394 * scanning, we multiply headroom by this
8395 * percentage factor for the next scan cycle,
8396 * since more compressed buffers are likely to
8397 * be present
8398 * l2arc_feed_secs seconds between L2ARC writing
8399 *
8400 * Tunables may be removed or added as future performance improvements are
8401 * integrated, and also may become zpool properties.
8402 *
 * There are two key functions that control how the L2ARC warms up:
 *
 * l2arc_write_eligible()	check if a buffer is eligible to cache
 * l2arc_write_size()		calculate how much to write
 *
 * These two functions determine what to write, how much, and how quickly
 * to send writes.
8410 *
8411 * L2ARC persistence:
8412 *
8413 * When writing buffers to L2ARC, we periodically add some metadata to
8414 * make sure we can pick them up after reboot, thus dramatically reducing
8415 * the impact that any downtime has on the performance of storage systems
8416 * with large caches.
8417 *
8418 * The implementation works fairly simply by integrating the following two
8419 * modifications:
8420 *
8421 * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
8422 * which is an additional piece of metadata which describes what's been
8423 * written. This allows us to rebuild the arc_buf_hdr_t structures of the
8424 * main ARC buffers. There are 2 linked-lists of log blocks headed by
8425 * dh_start_lbps[2]. We alternate which chain we append to, so they are
8426 * time-wise and offset-wise interleaved, but that is an optimization rather
8427 * than for correctness. The log block also includes a pointer to the
8428 * previous block in its chain.
8429 *
8430 * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
8431 * for our header bookkeeping purposes. This contains a device header,
8432 * which contains our top-level reference structures. We update it each
8433 * time we write a new log block, so that we're able to locate it in the
8434 * L2ARC device. If this write results in an inconsistent device header
8435 * (e.g. due to power failure), we detect this by verifying the header's
8436 * checksum and simply fail to reconstruct the L2ARC after reboot.
8437 *
8438 * Implementation diagram:
8439 *
8440 * +=== L2ARC device (not to scale) ======================================+
8441 * | ___two newest log block pointers__.__________ |
8442 * | / \dh_start_lbps[1] |
8443 * | / \ \dh_start_lbps[0]|
8444 * |.___/__. V V |
8445 * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
8446 * || hdr| ^ /^ /^ / / |
8447 * |+------+ ...--\-------/ \-----/--\------/ / |
8448 * | \--------------/ \--------------/ |
8449 * +======================================================================+
8450 *
8451 * As can be seen on the diagram, rather than using a simple linked list,
8452 * we use a pair of linked lists with alternating elements. This is a
8453 * performance enhancement due to the fact that we only find out the
8454 * address of the next log block access once the current block has been
8455 * completely read in. Obviously, this hurts performance, because we'd be
8456 * keeping the device's I/O queue at only a 1 operation deep, thus
8457 * incurring a large amount of I/O round-trip latency. Having two lists
8458 * allows us to fetch two log blocks ahead of where we are currently
8459 * rebuilding L2ARC buffers.
8460 *
8461 * On-device data structures:
8462 *
8463 * L2ARC device header: l2arc_dev_hdr_phys_t
8464 * L2ARC log block: l2arc_log_blk_phys_t
8465 *
8466 * L2ARC reconstruction:
8467 *
8468 * When writing data, we simply write in the standard rotary fashion,
8469 * evicting buffers as we go and simply writing new data over them (writing
8470 * a new log block every now and then). This obviously means that once we
8471 * loop around the end of the device, we will start cutting into an already
8472 * committed log block (and its referenced data buffers), like so:
8473 *
8474 * current write head__ __old tail
8475 * \ /
8476 * V V
8477 * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
8478 * ^ ^^^^^^^^^___________________________________
8479 * | \
8480 * <<nextwrite>> may overwrite this blk and/or its bufs --'
8481 *
8482 * When importing the pool, we detect this situation and use it to stop
8483 * our scanning process (see l2arc_rebuild).
8484 *
8485 * There is one significant caveat to consider when rebuilding ARC contents
8486 * from an L2ARC device: what about invalidated buffers? Given the above
8487 * construction, we cannot update blocks which we've already written to amend
8488 * them to remove buffers which were invalidated. Thus, during reconstruction,
8489 * we might be populating the cache with buffers for data that's not on the
8490 * main pool anymore, or may have been overwritten!
8491 *
8492 * As it turns out, this isn't a problem. Every arc_read request includes
8493 * both the DVA and, crucially, the birth TXG of the BP the caller is
8494 * looking for. So even if the cache were populated by completely rotten
8495 * blocks for data that had been long deleted and/or overwritten, we'll
8496 * never actually return bad data from the cache, since the DVA with the
8497 * birth TXG uniquely identify a block in space and time - once created,
8498 * a block is immutable on disk. The worst thing we have done is wasted
8499 * some time and memory at l2arc rebuild to reconstruct outdated ARC
8500 * entries that will get dropped from the l2arc as it is being updated
8501 * with new blocks.
8502 *
8503 * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
8504 * hand are not restored. This is done by saving the offset (in bytes)
8505 * l2arc_evict() has evicted to in the L2ARC device header and taking it
8506 * into account when restoring buffers.
8507 */
8508
8509 static boolean_t
l2arc_write_eligible(uint64_t spa_guid,arc_buf_hdr_t * hdr)8510 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
8511 {
8512 /*
8513 * A buffer is *not* eligible for the L2ARC if it:
8514 * 1. belongs to a different spa.
8515 * 2. is already cached on the L2ARC.
8516 * 3. has an I/O in progress (it may be an incomplete read).
8517 * 4. is flagged not eligible (zfs property).
8518 */
8519 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
8520 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
8521 return (B_FALSE);
8522
8523 return (B_TRUE);
8524 }
8525
8526 static uint64_t
l2arc_write_size(l2arc_dev_t * dev,clock_t * interval)8527 l2arc_write_size(l2arc_dev_t *dev, clock_t *interval)
8528 {
8529 uint64_t size;
8530 uint64_t write_rate = l2arc_get_write_rate(dev);
8531
8532 if (write_rate > L2ARC_BURST_SIZE_MAX) {
8533 /* Calculate interval to achieve desired rate with burst cap */
8534 uint64_t feeds_per_sec =
8535 MAX(DIV_ROUND_UP(write_rate, L2ARC_BURST_SIZE_MAX), 1);
8536 *interval = hz / feeds_per_sec;
8537 size = write_rate / feeds_per_sec;
8538 } else {
8539 *interval = hz; /* 1 second default */
8540 size = write_rate;
8541 }
8542
8543 /* We need to add in the worst case scenario of log block overhead. */
8544 size += l2arc_log_blk_overhead(size, dev);
8545 if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
8546 /*
8547 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
8548 * times the writesize, whichever is greater.
8549 */
8550 size += MAX(64 * 1024 * 1024,
8551 (size * l2arc_trim_ahead) / 100);
8552 }
8553
8554 /*
8555 * Make sure the write size does not exceed the size of the cache
8556 * device. This is important in l2arc_evict(), otherwise infinite
8557 * iteration can occur.
8558 */
8559 size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4);
8560
8561 size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift);
8562
8563 return (size);
8564
8565 }
8566
8567 /*
8568 * Free buffers that were tagged for destruction.
8569 */
8570 static void
l2arc_do_free_on_write(l2arc_dev_t * dev)8571 l2arc_do_free_on_write(l2arc_dev_t *dev)
8572 {
8573 l2arc_data_free_t *df, *df_next;
8574 boolean_t all = (dev == NULL);
8575
8576 mutex_enter(&l2arc_free_on_write_mtx);
8577 df = list_head(l2arc_free_on_write);
8578 while (df != NULL) {
8579 df_next = list_next(l2arc_free_on_write, df);
8580 if (all || df->l2df_dev == dev) {
8581 list_remove(l2arc_free_on_write, df);
8582 ASSERT3P(df->l2df_abd, !=, NULL);
8583 abd_free(df->l2df_abd);
8584 kmem_free(df, sizeof (l2arc_data_free_t));
8585 }
8586 df = df_next;
8587 }
8588 mutex_exit(&l2arc_free_on_write_mtx);
8589 }
8590
8591 /*
8592 * A write to a cache device has completed. Update all headers to allow
8593 * reads from these buffers to begin.
8594 */
8595 static void
l2arc_write_done(zio_t * zio)8596 l2arc_write_done(zio_t *zio)
8597 {
8598 l2arc_write_callback_t *cb;
8599 l2arc_lb_abd_buf_t *abd_buf;
8600 l2arc_lb_ptr_buf_t *lb_ptr_buf;
8601 l2arc_dev_t *dev;
8602 l2arc_dev_hdr_phys_t *l2dhdr;
8603 list_t *buflist;
8604 arc_buf_hdr_t *head, *hdr, *hdr_prev;
8605 kmutex_t *hash_lock;
8606 int64_t bytes_dropped = 0;
8607
8608 cb = zio->io_private;
8609 ASSERT3P(cb, !=, NULL);
8610 dev = cb->l2wcb_dev;
8611 l2dhdr = dev->l2ad_dev_hdr;
8612 ASSERT3P(dev, !=, NULL);
8613 head = cb->l2wcb_head;
8614 ASSERT3P(head, !=, NULL);
8615 buflist = &dev->l2ad_buflist;
8616 ASSERT3P(buflist, !=, NULL);
8617 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
8618 l2arc_write_callback_t *, cb);
8619
8620 /*
8621 * All writes completed, or an error was hit.
8622 */
8623 top:
8624 mutex_enter(&dev->l2ad_mtx);
8625 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
8626 hdr_prev = list_prev(buflist, hdr);
8627
8628 hash_lock = HDR_LOCK(hdr);
8629
8630 /*
8631 * We cannot use mutex_enter or else we can deadlock
8632 * with l2arc_write_buffers (due to swapping the order
8633 * the hash lock and l2ad_mtx are taken).
8634 */
8635 if (!mutex_tryenter(hash_lock)) {
8636 /*
8637 * Missed the hash lock. We must retry so we
8638 * don't leave the ARC_FLAG_L2_WRITING bit set.
8639 */
8640 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
8641
8642 /*
8643 * We don't want to rescan the headers we've
8644 * already marked as having been written out, so
8645 * we reinsert the head node so we can pick up
8646 * where we left off.
8647 */
8648 list_remove(buflist, head);
8649 list_insert_after(buflist, hdr, head);
8650
8651 mutex_exit(&dev->l2ad_mtx);
8652
8653 /*
8654 * We wait for the hash lock to become available
8655 * to try and prevent busy waiting, and increase
8656 * the chance we'll be able to acquire the lock
8657 * the next time around.
8658 */
8659 mutex_enter(hash_lock);
8660 mutex_exit(hash_lock);
8661 goto top;
8662 }
8663
8664 /*
8665 * We could not have been moved into the arc_l2c_only
8666 * state while in-flight due to our ARC_FLAG_L2_WRITING
8667 * bit being set. Let's just ensure that's being enforced.
8668 */
8669 ASSERT(HDR_HAS_L1HDR(hdr));
8670
8671 /*
8672 * Skipped - drop L2ARC entry and mark the header as no
8673 * longer L2 eligibile.
8674 */
8675 if (zio->io_error != 0) {
8676 /*
8677 * Error - drop L2ARC entry.
8678 */
8679 list_remove(buflist, hdr);
8680 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
8681
8682 uint64_t psize = HDR_GET_PSIZE(hdr);
8683 l2arc_hdr_arcstats_decrement(hdr);
8684
8685 ASSERT(dev->l2ad_vdev != NULL);
8686
8687 bytes_dropped +=
8688 vdev_psize_to_asize(dev->l2ad_vdev, psize);
8689 (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
8690 arc_hdr_size(hdr), hdr);
8691 }
8692
8693 /*
8694 * Allow ARC to begin reads and ghost list evictions to
8695 * this L2ARC entry.
8696 */
8697 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
8698
8699 mutex_exit(hash_lock);
8700 }
8701
8702 /*
8703 * Free the allocated abd buffers for writing the log blocks.
8704 * If the zio failed reclaim the allocated space and remove the
8705 * pointers to these log blocks from the log block pointer list
8706 * of the L2ARC device.
8707 */
8708 while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
8709 abd_free(abd_buf->abd);
8710 zio_buf_free(abd_buf, sizeof (*abd_buf));
8711 if (zio->io_error != 0) {
8712 lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
8713 /*
8714 * L2BLK_GET_PSIZE returns aligned size for log
8715 * blocks.
8716 */
8717 uint64_t asize =
8718 L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
8719 bytes_dropped += asize;
8720 ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
8721 ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
8722 zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
8723 lb_ptr_buf);
8724 (void) zfs_refcount_remove(&dev->l2ad_lb_count,
8725 lb_ptr_buf);
8726 kmem_free(lb_ptr_buf->lb_ptr,
8727 sizeof (l2arc_log_blkptr_t));
8728 kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
8729 }
8730 }
8731 list_destroy(&cb->l2wcb_abd_list);
8732
8733 if (zio->io_error != 0) {
8734 ARCSTAT_BUMP(arcstat_l2_writes_error);
8735
8736 /*
8737 * Restore the lbps array in the header to its previous state.
8738 * If the list of log block pointers is empty, zero out the
8739 * log block pointers in the device header.
8740 */
8741 lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
8742 for (int i = 0; i < 2; i++) {
8743 if (lb_ptr_buf == NULL) {
8744 /*
8745 * If the list is empty zero out the device
8746 * header. Otherwise zero out the second log
8747 * block pointer in the header.
8748 */
8749 if (i == 0) {
8750 memset(l2dhdr, 0,
8751 dev->l2ad_dev_hdr_asize);
8752 } else {
8753 memset(&l2dhdr->dh_start_lbps[i], 0,
8754 sizeof (l2arc_log_blkptr_t));
8755 }
8756 break;
8757 }
8758 memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr,
8759 sizeof (l2arc_log_blkptr_t));
8760 lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
8761 lb_ptr_buf);
8762 }
8763 }
8764
8765 ARCSTAT_BUMP(arcstat_l2_writes_done);
8766 list_remove(buflist, head);
8767 ASSERT(!HDR_HAS_L1HDR(head));
8768 kmem_cache_free(hdr_l2only_cache, head);
8769 mutex_exit(&dev->l2ad_mtx);
8770
8771 ASSERT(dev->l2ad_vdev != NULL);
8772 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
8773
8774 l2arc_do_free_on_write(dev);
8775
8776 kmem_free(cb, sizeof (l2arc_write_callback_t));
8777 }
8778
/*
 * Undo the on-disk transforms (encryption, compression) applied to an
 * L2ARC buffer so the in-memory copy matches what the ARC expects.
 * Called with the header's hash lock held.  On success, b_pabd (and
 * zio->io_abd) reference the fully untransformed data and 0 is
 * returned; on failure the errno from the failing transform is
 * returned and the original buffer is left in place.
 */
static int
l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
{
	int ret;
	spa_t *spa = zio->io_spa;
	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
	blkptr_t *bp = zio->io_bp;
	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];
	boolean_t no_crypt = B_FALSE;

	/*
	 * ZIL data is never written to the L2ARC, so we don't need
	 * special handling for its unique MAC storage.
	 */
	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);

	/*
	 * If the data was encrypted, decrypt it now. Note that
	 * we must check the bp here and not the hdr, since the
	 * hdr does not have its encryption parameters updated
	 * until arc_read_done().
	 */
	if (BP_IS_ENCRYPTED(bp)) {
		abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
		    ARC_HDR_USE_RESERVE);

		/* Recover the crypt parameters stored in the blkptr. */
		zio_crypt_decode_params_bp(bp, salt, iv);
		zio_crypt_decode_mac_bp(bp, mac);

		ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
		    salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
		    hdr->b_l1hdr.b_pabd, &no_crypt);
		if (ret != 0) {
			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
			goto error;
		}

		/*
		 * If we actually performed decryption, replace b_pabd
		 * with the decrypted data. Otherwise we can just throw
		 * our decryption buffer away.
		 */
		if (!no_crypt) {
			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
			    arc_hdr_size(hdr), hdr);
			hdr->b_l1hdr.b_pabd = eabd;
			zio->io_abd = eabd;
		} else {
			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
		}
	}

	/*
	 * If the L2ARC block was compressed, but ARC compression
	 * is disabled we decompress the data into a new buffer and
	 * replace the existing data.
	 */
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    !HDR_COMPRESSION_ENABLED(hdr)) {
		abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
		    ARC_HDR_USE_RESERVE);

		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr),
		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
		if (ret != 0) {
			arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
			goto error;
		}

		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
		    arc_hdr_size(hdr), hdr);
		hdr->b_l1hdr.b_pabd = cabd;
		zio->io_abd = cabd;
		/* The zio now carries logical (decompressed) size data. */
		zio->io_size = HDR_GET_LSIZE(hdr);
	}

	return (0);

error:
	return (ret);
}
8866
8867
/*
 * A read to a cache device completed. Validate buffer contents before
 * handing over to the regular ARC routines.
 *
 * If the buffer fails validation (bad checksum, failed untransform, I/O
 * error, or the header was evicted while the read was in flight), the
 * read is reissued to the primary pool storage instead of failing the
 * caller.
 */
static void
l2arc_read_done(zio_t *zio)
{
	int tfm_error = 0;
	l2arc_read_callback_t *cb = zio->io_private;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	boolean_t valid_cksum;
	/* Raw encrypted reads land in b_rabd rather than b_pabd. */
	boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));

	ASSERT3P(zio->io_vd, !=, NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	/* Drop the config lock held across the L2ARC read. */
	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	ASSERT3P(cb, !=, NULL);
	hdr = cb->l2rcb_hdr;
	ASSERT3P(hdr, !=, NULL);

	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the data was read into a temporary buffer,
	 * move it and free the buffer.
	 */
	if (cb->l2rcb_abd != NULL) {
		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
		if (zio->io_error == 0) {
			if (using_rdata) {
				abd_copy(hdr->b_crypt_hdr.b_rabd,
				    cb->l2rcb_abd, arc_hdr_size(hdr));
			} else {
				abd_copy(hdr->b_l1hdr.b_pabd,
				    cb->l2rcb_abd, arc_hdr_size(hdr));
			}
		}

		/*
		 * The following must be done regardless of whether
		 * there was an error:
		 * - free the temporary buffer
		 * - point zio to the real ARC buffer
		 * - set zio size accordingly
		 * These are required because zio is either re-used for
		 * an I/O of the block in the case of the error
		 * or the zio is passed to arc_read_done() and it
		 * needs real data.
		 */
		abd_free(cb->l2rcb_abd);
		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);

		if (using_rdata) {
			ASSERT(HDR_HAS_RABD(hdr));
			zio->io_abd = zio->io_orig_abd =
			    hdr->b_crypt_hdr.b_rabd;
		} else {
			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
		}
	}

	ASSERT3P(zio->io_abd, !=, NULL);

	/*
	 * Check this survived the L2ARC journey.
	 */
	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
	zio->io_prop.zp_complevel = hdr->b_complevel;

	valid_cksum = arc_cksum_is_equal(hdr, zio);

	/*
	 * b_rabd will always match the data as it exists on disk if it is
	 * being used. Therefore if we are reading into b_rabd we do not
	 * attempt to untransform the data.
	 */
	if (valid_cksum && !using_rdata)
		tfm_error = l2arc_untransform(zio, cb);

	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
	    !HDR_L2_EVICTED(hdr)) {
		/* Buffer validated; hand off to the regular ARC path. */
		mutex_exit(hash_lock);
		zio->io_private = hdr;
		arc_read_done(zio);
	} else {
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!valid_cksum || tfm_error != 0)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);
			void *abd = (using_rdata) ?
			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio = zio_read(pio, zio->io_spa, zio->io_bp,
			    abd, zio->io_size, arc_read_done,
			    hdr, zio->io_priority, cb->l2rcb_flags,
			    &cb->l2rcb_zb);

			/*
			 * Original ZIO will be freed, so we need to update
			 * ARC header with the new ZIO pointer to be used
			 * by zio_change_priority() in arc_read().
			 */
			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
			    acb != NULL; acb = acb->acb_next)
				acb->acb_zio_head = zio;

			mutex_exit(hash_lock);
			zio_nowait(zio);
		} else {
			mutex_exit(hash_lock);
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}
9010
9011 /*
9012 * Get the multilist for the given list number (0..3) to cycle through
9013 * lists in the desired order. This order can have a significant effect
9014 * on cache performance.
9015 *
9016 * Currently the metadata lists are hit first, MFU then MRU, followed by
9017 * the data lists.
9018 */
9019 static multilist_t *
l2arc_get_list(int list_num)9020 l2arc_get_list(int list_num)
9021 {
9022 ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
9023
9024 switch (list_num) {
9025 case 0:
9026 return (&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
9027 case 1:
9028 return (&arc_mru->arcs_list[ARC_BUFC_METADATA]);
9029 case 2:
9030 return (&arc_mfu->arcs_list[ARC_BUFC_DATA]);
9031 case 3:
9032 return (&arc_mru->arcs_list[ARC_BUFC_DATA]);
9033 default:
9034 return (NULL);
9035 }
9036 }
9037
9038
9039 /*
9040 * Lock a specific sublist within the given list number.
9041 */
9042 static multilist_sublist_t *
l2arc_sublist_lock(int list_num,int sublist_idx)9043 l2arc_sublist_lock(int list_num, int sublist_idx)
9044 {
9045 multilist_t *ml = l2arc_get_list(list_num);
9046 if (ml == NULL)
9047 return (NULL);
9048
9049 return (multilist_sublist_lock_idx(ml, sublist_idx));
9050 }
9051
9052 /*
9053 * Check if a pool has any L2ARC devices.
9054 */
9055 static boolean_t
l2arc_pool_has_devices(spa_t * target_spa)9056 l2arc_pool_has_devices(spa_t *target_spa)
9057 {
9058 l2arc_dev_t *dev;
9059
9060 ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
9061
9062 for (dev = list_head(l2arc_dev_list); dev != NULL;
9063 dev = list_next(l2arc_dev_list, dev)) {
9064 if (dev->l2ad_spa == target_spa) {
9065 return (B_TRUE);
9066 }
9067 }
9068
9069 return (B_FALSE);
9070 }
9071
/*
 * Initialize pool-based markers for l2arc position saving.
 *
 * For each feed pass (see l2arc_get_list()) this allocates one marker
 * header per multilist sublist and inserts it at the sublist tail so
 * the feed thread can later resume scanning from a saved position.
 * Per-sublist busy/reset flag arrays and the per-pass scanned counter
 * are also allocated/cleared here.  Everything set up here is torn
 * down by l2arc_pool_markers_fini().
 */
static void
l2arc_pool_markers_init(spa_t *spa)
{
	mutex_init(&spa->spa_l2arc_info.l2arc_sublist_lock, NULL,
	    MUTEX_DEFAULT, NULL);

	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
		multilist_t *ml = l2arc_get_list(pass);
		if (ml == NULL)
			continue;

		int num_sublists = multilist_get_num_sublists(ml);

		spa->spa_l2arc_info.l2arc_markers[pass] =
		    arc_state_alloc_markers(num_sublists);
		spa->spa_l2arc_info.l2arc_sublist_busy[pass] =
		    kmem_zalloc(num_sublists * sizeof (boolean_t), KM_SLEEP);
		spa->spa_l2arc_info.l2arc_sublist_reset[pass] =
		    kmem_zalloc(num_sublists * sizeof (boolean_t), KM_SLEEP);

		/* Seed each sublist with its marker at the tail. */
		for (int i = 0; i < num_sublists; i++) {
			multilist_sublist_t *mls =
			    multilist_sublist_lock_idx(ml, i);
			multilist_sublist_insert_tail(mls,
			    spa->spa_l2arc_info.l2arc_markers[pass][i]);
			multilist_sublist_unlock(mls);
		}

		spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0;
	}
}
9106
/*
 * Free all allocated pool-based markers.
 *
 * Mirror of l2arc_pool_markers_init(): for each pass, unlink every
 * marker from its sublist (it must still be linked), then free the
 * marker array and the per-sublist busy/reset flag arrays.  Passes
 * whose markers were never allocated are skipped.
 */
static void
l2arc_pool_markers_fini(spa_t *spa)
{
	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
		if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL)
			continue;

		multilist_t *ml = l2arc_get_list(pass);
		if (ml == NULL)
			continue;

		int num_sublists = multilist_get_num_sublists(ml);

		/* Unlink each marker from its sublist before freeing. */
		for (int i = 0; i < num_sublists; i++) {
			ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i],
			    !=, NULL);
			multilist_sublist_t *mls =
			    multilist_sublist_lock_idx(ml, i);
			ASSERT(multilist_link_active(
			    &spa->spa_l2arc_info.l2arc_markers[pass][i]->
			    b_l1hdr.b_arc_node));
			multilist_sublist_remove(mls,
			    spa->spa_l2arc_info.l2arc_markers[pass][i]);
			multilist_sublist_unlock(mls);
		}

		arc_state_free_markers(spa->spa_l2arc_info.l2arc_markers[pass],
		    num_sublists);
		spa->spa_l2arc_info.l2arc_markers[pass] = NULL;

		/* Free sublist busy and reset flags for this pass */
		ASSERT3P(spa->spa_l2arc_info.l2arc_sublist_busy[pass], !=,
		    NULL);
		kmem_free(spa->spa_l2arc_info.l2arc_sublist_busy[pass],
		    num_sublists * sizeof (boolean_t));
		spa->spa_l2arc_info.l2arc_sublist_busy[pass] = NULL;

		ASSERT3P(spa->spa_l2arc_info.l2arc_sublist_reset[pass], !=,
		    NULL);
		kmem_free(spa->spa_l2arc_info.l2arc_sublist_reset[pass],
		    num_sublists * sizeof (boolean_t));
		spa->spa_l2arc_info.l2arc_sublist_reset[pass] = NULL;
	}

	mutex_destroy(&spa->spa_l2arc_info.l2arc_sublist_lock);
}
9156
9157 /*
9158 * Calculates the maximum overhead of L2ARC metadata log blocks for a given
9159 * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
9160 * overhead in processing to make sure there is enough headroom available
9161 * when writing buffers.
9162 */
9163 static inline uint64_t
l2arc_log_blk_overhead(uint64_t write_sz,l2arc_dev_t * dev)9164 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
9165 {
9166 if (dev->l2ad_log_entries == 0) {
9167 return (0);
9168 } else {
9169 ASSERT(dev->l2ad_vdev != NULL);
9170
9171 uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
9172
9173 uint64_t log_blocks = (log_entries +
9174 dev->l2ad_log_entries - 1) /
9175 dev->l2ad_log_entries;
9176
9177 return (vdev_psize_to_asize(dev->l2ad_vdev,
9178 sizeof (l2arc_log_blk_phys_t)) * log_blocks);
9179 }
9180 }
9181
/*
 * Bump the DWPD generation to trigger stats reset on all devices.
 *
 * Each device caches the generation it last saw (l2ad_dwpd_bump) and
 * performs a full budget reset on mismatch; see l2arc_dwpd_rate_limit()
 * and l2arc_evict().
 *
 * NOTE(review): the increment is not atomic — presumably concurrent
 * callers/readers only compare for inequality, so a lost update is
 * harmless; confirm against all call sites.
 */
void
l2arc_dwpd_bump_reset(void)
{
	l2arc_dwpd_bump++;
}
9190
/*
 * Calculate DWPD rate limit for L2ARC device.
 *
 * The daily write budget is (device capacity * l2arc_dwpd_limit / 100)
 * bytes.  Accounting is reset when the global l2arc_dwpd_bump
 * generation changes (full reset, no carryover) or when the 24h period
 * expires (unused budget from the previous period — at most one day's
 * worth — is carried over).  The return value spreads the remaining
 * budget (plus a burst allowance) evenly over the seconds left in the
 * current period; 0 means the budget is exhausted.
 */
static uint64_t
l2arc_dwpd_rate_limit(l2arc_dev_t *dev)
{
	uint64_t device_size = dev->l2ad_end - dev->l2ad_start;
	uint64_t daily_budget = (device_size * l2arc_dwpd_limit) / 100;
	uint64_t now = gethrestime_sec();

	/* Reset stats on param change or daily period expiry */
	if (dev->l2ad_dwpd_bump != l2arc_dwpd_bump ||
	    (now - dev->l2ad_dwpd_start) >= 24 * 3600) {
		if (dev->l2ad_dwpd_bump != l2arc_dwpd_bump) {
			/* Full reset on param change, no carryover */
			dev->l2ad_dwpd_accumulated = 0;
			dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
		} else {
			/* Save unused budget from last period (max 1 day) */
			if (dev->l2ad_dwpd_writes >= daily_budget)
				dev->l2ad_dwpd_accumulated = 0;
			else
				dev->l2ad_dwpd_accumulated =
				    daily_budget - dev->l2ad_dwpd_writes;
		}
		dev->l2ad_dwpd_writes = 0;
		dev->l2ad_dwpd_start = now;
	}

	/* Clamp to >= 1 to avoid dividing by zero at period end. */
	uint64_t elapsed = now - dev->l2ad_dwpd_start;
	uint64_t remaining_secs = MAX((24 * 3600) - elapsed, 1);
	/* Add burst allowance for the first write after device wrap */
	uint64_t total_budget = daily_budget + dev->l2ad_dwpd_accumulated +
	    L2ARC_BURST_SIZE_MAX;

	if (dev->l2ad_dwpd_writes >= total_budget)
		return (0);

	return ((total_budget - dev->l2ad_dwpd_writes) / remaining_secs);
}
9231
/*
 * Get write rate based on device state and DWPD configuration.
 *
 * Returns l2arc_write_max (sanitized to a nonzero default) unless DWPD
 * limiting applies, in which case the smaller of the DWPD-derived rate
 * and l2arc_write_max is returned.  DWPD limiting only engages after
 * the device's first full pass (l2ad_first cleared), when
 * l2arc_dwpd_limit is enabled, and when the pool's total L2ARC
 * capacity reaches L2ARC_PERSIST_THRESHOLD.
 */
static uint64_t
l2arc_get_write_rate(l2arc_dev_t *dev)
{
	uint64_t write_max = l2arc_write_max;
	spa_t *spa = dev->l2ad_spa;

	/*
	 * Make sure l2arc_write_max is valid in case user altered it.
	 */
	if (write_max == 0) {
		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
		write_max = l2arc_write_max = L2ARC_WRITE_SIZE;
	}

	/* Apply DWPD rate limit for persistent marker configurations */
	if (!dev->l2ad_first && l2arc_dwpd_limit > 0 &&
	    spa->spa_l2arc_info.l2arc_total_capacity >=
	    L2ARC_PERSIST_THRESHOLD) {
		uint64_t dwpd_rate = l2arc_dwpd_rate_limit(dev);
		return (MIN(dwpd_rate, write_max));
	}

	return (write_max);
}
9260
/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes. This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 *
 * Eviction here means dropping the L2ARC headers (and their log block
 * pointers) that map into the region; the on-device data is not erased
 * unless TRIM is issued.  When the region would run past the end of the
 * device, eviction wraps: it evicts to the end, resets the hands to the
 * start, and iterates once more for the remainder.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;		/* target address to evict up to */
	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
	vdev_t *vd = dev->l2ad_vdev;
	boolean_t rerun;

	/* On cache device removal (all) the vdev/spa may already be gone. */
	ASSERT(vd != NULL || all);
	ASSERT(dev->l2ad_spa != NULL || all);

	buflist = &dev->l2ad_buflist;

top:
	rerun = B_FALSE;
	if (dev->l2ad_hand + distance > dev->l2ad_end) {
		/*
		 * When there is no space to accommodate upcoming writes,
		 * evict to the end. Then bump the write and evict hands
		 * to the start and iterate. This iteration does not
		 * happen indefinitely as we make sure in
		 * l2arc_write_size() that when the write hand is reset,
		 * the write size does not exceed the end of the device.
		 */
		rerun = B_TRUE;
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

	if (!all) {
		/*
		 * This check has to be placed after deciding whether to
		 * iterate (rerun).
		 */
		if (dev->l2ad_first) {
			/*
			 * This is the first sweep through the device. There is
			 * nothing to evict. We have already trimmed the
			 * whole device.
			 */
			goto out;
		} else {
			/*
			 * Trim the space to be evicted.
			 */
			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
			    l2arc_trim_ahead > 0) {
				/*
				 * We have to drop the spa_config lock because
				 * vdev_trim_range() will acquire it.
				 * l2ad_evict already accounts for the label
				 * size. To prevent vdev_trim_ranges() from
				 * adding it again, we subtract it from
				 * l2ad_evict.
				 */
				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
				vdev_trim_simple(vd,
				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
				    taddr - dev->l2ad_evict);
				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
				    RW_READER);
			}

			/*
			 * When rebuilding L2ARC we retrieve the evict hand
			 * from the header of the device. Of note, l2arc_evict()
			 * does not actually delete buffers from the cache
			 * device, but trimming may do so depending on the
			 * hardware implementation. Thus keeping track of the
			 * evict hand is useful.
			 */
			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
		}
	}

retry:
	mutex_enter(&dev->l2ad_mtx);
	/*
	 * We have to account for evicted log blocks. Run vdev_space_update()
	 * on log blocks whose offset (in bytes) is before the evicted offset
	 * (in bytes) by searching in the list of pointers to log blocks
	 * present in the L2ARC device.
	 */
	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
	    lb_ptr_buf = lb_ptr_buf_prev) {

		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		uint64_t asize = L2BLK_GET_PSIZE(
		    (lb_ptr_buf->lb_ptr)->lbp_prop);

		/*
		 * We don't worry about log blocks left behind (ie
		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
		 * will never write more than l2arc_evict() evicts.
		 */
		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
			break;
		} else {
			if (vd != NULL)
				vdev_space_update(vd, -asize, 0, 0);
			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
			    lb_ptr_buf);
			(void) zfs_refcount_remove(&dev->l2ad_lb_count,
			    lb_ptr_buf);
			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
			kmem_free(lb_ptr_buf->lb_ptr,
			    sizeof (l2arc_log_blkptr_t));
			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
		}
	}

	/* Walk the buffer headers newest-to-oldest, evicting in-region ones. */
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		ASSERT(!HDR_EMPTY(hdr));
		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock. Retry.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto retry;
		}

		/*
		 * A header can't be on this list if it doesn't have L2 header.
		 */
		ASSERT(HDR_HAS_L2HDR(hdr));

		/* Ensure this header has finished being written. */
		ASSERT(!HDR_L2_WRITING(hdr));
		ASSERT(!HDR_L2_WRITE_HEAD(hdr));

		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC. Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_lsize.
			 */
			arc_change_state(arc_anon, hdr);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
			}

			arc_hdr_l2hdr_destroy(hdr);
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);

out:
	/*
	 * We need to check if we evict all buffers, otherwise we may iterate
	 * unnecessarily.
	 */
	if (!all && rerun) {
		/*
		 * Bump device hand to the device start if it is approaching the
		 * end. l2arc_evict() has already evicted ahead for this case.
		 */
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_evict = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
		/*
		 * Reset DWPD counters - first pass writes are free, start
		 * fresh 24h budget period now that device is full.
		 */
		dev->l2ad_dwpd_writes = 0;
		dev->l2ad_dwpd_start = gethrestime_sec();
		dev->l2ad_dwpd_accumulated = 0;
		dev->l2ad_dwpd_bump = l2arc_dwpd_bump;
		goto top;
	}

	if (!all) {
		/*
		 * In case of cache device removal (all) the following
		 * assertions may be violated without functional consequences
		 * as the device is about to be removed.
		 */
		ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
		if (!dev->l2ad_first)
			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
	}
}
9492
/*
 * Handle any abd transforms that might be required for writing to the L2ARC.
 * If successful, this function will always return an abd with the data
 * transformed as it is on disk in a new abd of asize bytes.
 *
 * The possible transforms, applied in order, are: plain copy-out of an
 * existing raw (b_rabd) buffer, plain reallocation to asize,再
 * compression of data whose in-core copy is uncompressed, and
 * re-encryption.  The returned abd (*abd_out) is always a new
 * allocation owned by the caller; on failure *abd_out is NULL and an
 * errno is returned.
 */
static int
l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
    abd_t **abd_out)
{
	int ret;
	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
	uint64_t psize = HDR_GET_PSIZE(hdr);
	uint64_t size = arc_hdr_size(hdr);
	boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
	dsl_crypto_key_t *dck = NULL;
	uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
	boolean_t no_crypt = B_FALSE;

	/* The caller must need at least one of the transforms below. */
	ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    !HDR_COMPRESSION_ENABLED(hdr)) ||
	    HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
	ASSERT3U(psize, <=, asize);

	/*
	 * If this data simply needs its own buffer, we simply allocate it
	 * and copy the data. This may be done to eliminate a dependency on a
	 * shared buffer or to reallocate the buffer to match asize.
	 */
	if (HDR_HAS_RABD(hdr)) {
		ASSERT3U(asize, >, psize);
		to_write = abd_alloc_for_io(asize, ismd);
		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
		/* Pad the tail so the full asize is deterministic. */
		abd_zero_off(to_write, psize, asize - psize);
		goto out;
	}

	if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
	    !HDR_ENCRYPTED(hdr)) {
		ASSERT3U(size, ==, psize);
		to_write = abd_alloc_for_io(asize, ismd);
		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
		if (asize > size)
			abd_zero_off(to_write, size, asize - size);
		goto out;
	}

	/* Re-compress data whose in-core copy is held uncompressed. */
	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
		cabd = abd_alloc_for_io(MAX(size, asize), ismd);
		uint64_t csize = zio_compress_data(compress, to_write, &cabd,
		    size, MIN(size, psize), hdr->b_complevel);
		if (csize >= size || csize > psize) {
			/*
			 * We can't re-compress the block into the original
			 * psize. Even if it fits into asize, it does not
			 * matter, since checksum will never match on read.
			 */
			abd_free(cabd);
			return (SET_ERROR(EIO));
		}
		if (asize > csize)
			abd_zero_off(cabd, csize, asize - csize);
		to_write = cabd;
	}

	if (HDR_ENCRYPTED(hdr)) {
		eabd = abd_alloc_for_io(asize, ismd);

		/*
		 * If the dataset was disowned before the buffer
		 * made it to this point, the key to re-encrypt
		 * it won't be available. In this case we simply
		 * won't write the buffer to the L2ARC.
		 */
		ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
		    FTAG, &dck);
		if (ret != 0)
			goto error;

		ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
		    hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
		    hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
		    &no_crypt);
		if (ret != 0)
			goto error;

		if (no_crypt)
			abd_copy(eabd, to_write, psize);

		if (psize != asize)
			abd_zero_off(eabd, psize, asize - psize);

		/* assert that the MAC we got here matches the one we saved */
		ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
		spa_keystore_dsl_key_rele(spa, dck, FTAG);

		/* The compression buffer was only an intermediate step. */
		if (to_write == cabd)
			abd_free(cabd);

		to_write = eabd;
	}

out:
	ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
	*abd_out = to_write;
	return (0);

error:
	if (dck != NULL)
		spa_keystore_dsl_key_rele(spa, dck, FTAG);
	if (cabd != NULL)
		abd_free(cabd);
	if (eabd != NULL)
		abd_free(eabd);

	*abd_out = NULL;
	return (ret);
}
9612
9613 /*
9614 * Write buffers from a single sublist to L2ARC.
9615 * Handles locking, marker determination, and buffer processing.
9616 * Returns B_TRUE if target size reached, B_FALSE otherwise.
9617 */
9618 static boolean_t
l2arc_write_sublist(spa_t * spa,l2arc_dev_t * dev,int pass,int sublist_idx,uint64_t target_sz,uint64_t * write_asize,uint64_t * write_psize,zio_t ** pio,l2arc_write_callback_t ** cb,arc_buf_hdr_t * head,uint64_t * consumed,uint64_t sublist_headroom,boolean_t save_position)9619 l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, int pass, int sublist_idx,
9620 uint64_t target_sz, uint64_t *write_asize, uint64_t *write_psize,
9621 zio_t **pio, l2arc_write_callback_t **cb, arc_buf_hdr_t *head,
9622 uint64_t *consumed, uint64_t sublist_headroom, boolean_t save_position)
9623 {
9624 multilist_sublist_t *mls;
9625 arc_buf_hdr_t *hdr;
9626 arc_buf_hdr_t *persistent_marker, *local_marker;
9627 boolean_t full = B_FALSE;
9628 boolean_t scan_from_head = B_FALSE;
9629 uint64_t guid = spa_load_guid(spa);
9630
9631 mls = l2arc_sublist_lock(pass, sublist_idx);
9632 ASSERT3P(mls, !=, NULL);
9633
9634 persistent_marker = spa->spa_l2arc_info.
9635 l2arc_markers[pass][sublist_idx];
9636
9637 /*
9638 * Check if this sublist's marker was flagged for reset to tail.
9639 * This handles depth cap resets and global resets without needing
9640 * to coordinate with actively-scanning threads.
9641 */
9642 if (save_position &&
9643 spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx]) {
9644 multilist_sublist_remove(mls, persistent_marker);
9645 multilist_sublist_insert_tail(mls, persistent_marker);
9646 spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx] =
9647 B_FALSE;
9648 }
9649
9650 if (save_position && persistent_marker == multilist_sublist_head(mls)) {
9651 multilist_sublist_unlock(mls);
9652 return (B_FALSE);
9653 }
9654
9655 local_marker = arc_state_alloc_marker();
9656
9657 if (save_position) {
9658 hdr = multilist_sublist_prev(mls, persistent_marker);
9659 ASSERT3P(hdr, !=, NULL);
9660 scan_from_head = B_FALSE;
9661 } else {
9662 if (arc_warm) {
9663 hdr = multilist_sublist_tail(mls);
9664 scan_from_head = B_FALSE;
9665 } else {
9666 hdr = multilist_sublist_head(mls);
9667 scan_from_head = B_TRUE;
9668 }
9669 ASSERT3P(hdr, !=, NULL);
9670 }
9671
9672 while (hdr != NULL) {
9673 kmutex_t *hash_lock;
9674 abd_t *to_write = NULL;
9675
9676 hash_lock = HDR_LOCK(hdr);
9677 if (!mutex_tryenter(hash_lock)) {
9678 skip:
9679 /* Skip this buffer rather than waiting. */
9680 if (scan_from_head)
9681 hdr = multilist_sublist_next(mls, hdr);
9682 else
9683 hdr = multilist_sublist_prev(mls, hdr);
9684 continue;
9685 }
9686
9687 if (l2arc_headroom != 0 &&
9688 *consumed + HDR_GET_LSIZE(hdr) >
9689 MAX(sublist_headroom, HDR_GET_LSIZE(hdr))) {
9690 /*
9691 * Searched too far in this sublist.
9692 */
9693 mutex_exit(hash_lock);
9694 break;
9695 }
9696
9697 *consumed += HDR_GET_LSIZE(hdr);
9698
9699 if (!l2arc_write_eligible(guid, hdr)) {
9700 mutex_exit(hash_lock);
9701 goto skip;
9702 }
9703
9704 ASSERT(HDR_HAS_L1HDR(hdr));
9705 ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
9706 ASSERT3U(arc_hdr_size(hdr), >, 0);
9707 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
9708 uint64_t psize = HDR_GET_PSIZE(hdr);
9709 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
9710
9711 /*
9712 * If the allocated size of this buffer plus the max
9713 * size for the pending log block exceeds the evicted
9714 * target size, terminate writing buffers for this run.
9715 */
9716 if (*write_asize + asize +
9717 sizeof (l2arc_log_blk_phys_t) > target_sz) {
9718 full = B_TRUE;
9719 mutex_exit(hash_lock);
9720 break;
9721 }
9722
9723 /*
9724 * We should not sleep with sublist lock held or it
9725 * may block ARC eviction. Insert a marker to save
9726 * the position and drop the lock.
9727 */
9728 if (scan_from_head)
9729 multilist_sublist_insert_after(mls, hdr, local_marker);
9730 else
9731 multilist_sublist_insert_before(mls, hdr, local_marker);
9732 multilist_sublist_unlock(mls);
9733
9734 /*
9735 * If this header has b_rabd, we can use this since it
9736 * must always match the data exactly as it exists on
9737 * disk. Otherwise, the L2ARC can normally use the
9738 * hdr's data, but if we're sharing data between the
9739 * hdr and one of its bufs, L2ARC needs its own copy of
9740 * the data so that the ZIO below can't race with the
9741 * buf consumer. To ensure that this copy will be
9742 * available for the lifetime of the ZIO and be cleaned
9743 * up afterwards, we add it to the l2arc_free_on_write
9744 * queue. If we need to apply any transforms to the
9745 * data (compression, encryption) we will also need the
9746 * extra buffer.
9747 */
9748 if (HDR_HAS_RABD(hdr) && psize == asize) {
9749 to_write = hdr->b_crypt_hdr.b_rabd;
9750 } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
9751 HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
9752 !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
9753 psize == asize) {
9754 to_write = hdr->b_l1hdr.b_pabd;
9755 } else {
9756 int ret = l2arc_apply_transforms(spa, hdr, asize,
9757 &to_write);
9758 if (ret != 0) {
9759 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
9760 mutex_exit(hash_lock);
9761 goto next;
9762 }
9763
9764 l2arc_free_abd_on_write(to_write, dev);
9765 }
9766
9767 hdr->b_l2hdr.b_dev = dev;
9768 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
9769 hdr->b_l2hdr.b_hits = 0;
9770 hdr->b_l2hdr.b_arcs_state =
9771 hdr->b_l1hdr.b_state->arcs_state;
9772 /* l2arc_hdr_arcstats_update() expects a valid asize */
9773 HDR_SET_L2SIZE(hdr, asize);
9774 arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
9775 ARC_FLAG_L2_WRITING);
9776
9777 (void) zfs_refcount_add_many(&dev->l2ad_alloc,
9778 arc_hdr_size(hdr), hdr);
9779 l2arc_hdr_arcstats_increment(hdr);
9780 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
9781
9782 mutex_enter(&dev->l2ad_mtx);
9783 if (*pio == NULL) {
9784 /*
9785 * Insert a dummy header on the buflist so
9786 * l2arc_write_done() can find where the
9787 * write buffers begin without searching.
9788 */
9789 list_insert_head(&dev->l2ad_buflist, head);
9790 }
9791 list_insert_head(&dev->l2ad_buflist, hdr);
9792 mutex_exit(&dev->l2ad_mtx);
9793
9794 boolean_t commit = l2arc_log_blk_insert(dev, hdr);
9795 mutex_exit(hash_lock);
9796
9797 if (*pio == NULL) {
9798 *cb = kmem_alloc(sizeof (l2arc_write_callback_t),
9799 KM_SLEEP);
9800 (*cb)->l2wcb_dev = dev;
9801 (*cb)->l2wcb_head = head;
9802 list_create(&(*cb)->l2wcb_abd_list,
9803 sizeof (l2arc_lb_abd_buf_t),
9804 offsetof(l2arc_lb_abd_buf_t, node));
9805 *pio = zio_root(spa, l2arc_write_done, *cb,
9806 ZIO_FLAG_CANFAIL);
9807 }
9808
9809 zio_t *wzio = zio_write_phys(*pio, dev->l2ad_vdev,
9810 dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF,
9811 NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE,
9812 ZIO_FLAG_CANFAIL, B_FALSE);
9813
9814 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
9815 zio_t *, wzio);
9816 zio_nowait(wzio);
9817
9818 *write_psize += psize;
9819 *write_asize += asize;
9820 dev->l2ad_hand += asize;
9821
9822 if (commit) {
9823 /* l2ad_hand will be adjusted inside. */
9824 *write_asize += l2arc_log_blk_commit(dev, *pio, *cb);
9825 }
9826
9827 next:
9828 multilist_sublist_lock(mls);
9829 if (scan_from_head)
9830 hdr = multilist_sublist_next(mls, local_marker);
9831 else
9832 hdr = multilist_sublist_prev(mls, local_marker);
9833 multilist_sublist_remove(mls, local_marker);
9834 }
9835
9836 /* Reposition persistent marker for next iteration. */
9837 multilist_sublist_remove(mls, persistent_marker);
9838 if (save_position &&
9839 spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx]) {
9840 /* Reset flagged during scan, restart from tail. */
9841 multilist_sublist_insert_tail(mls, persistent_marker);
9842 spa->spa_l2arc_info.l2arc_sublist_reset[pass][sublist_idx] =
9843 B_FALSE;
9844 } else if (save_position && hdr != NULL) {
9845 /*
9846 * Write budget or sublist headroom exhausted, position
9847 * marker after hdr to retry it next time.
9848 */
9849 multilist_sublist_insert_after(mls, hdr, persistent_marker);
9850 } else if (save_position) {
9851 /* End of sublist, position marker at head. */
9852 multilist_sublist_insert_head(mls, persistent_marker);
9853 } else {
9854 /* Non-persistent, reset marker to tail. */
9855 multilist_sublist_insert_tail(mls, persistent_marker);
9856 }
9857
9858 multilist_sublist_unlock(mls);
9859
9860 arc_state_free_marker(local_marker);
9861
9862 return (full);
9863 }
9864
9865 static void
l2arc_blk_fetch_done(zio_t * zio)9866 l2arc_blk_fetch_done(zio_t *zio)
9867 {
9868 l2arc_read_callback_t *cb;
9869
9870 cb = zio->io_private;
9871 if (cb->l2rcb_abd != NULL)
9872 abd_free(cb->l2rcb_abd);
9873 kmem_free(cb, sizeof (l2arc_read_callback_t));
9874 }
9875
9876 /*
9877 * Return the total size of the ARC state corresponding to the given
9878 * L2ARC pass number (0..3).
9879 */
9880 static uint64_t
l2arc_get_state_size(int pass)9881 l2arc_get_state_size(int pass)
9882 {
9883 switch (pass) {
9884 case L2ARC_MFU_META:
9885 return (zfs_refcount_count(
9886 &arc_mfu->arcs_size[ARC_BUFC_METADATA]));
9887 case L2ARC_MRU_META:
9888 return (zfs_refcount_count(
9889 &arc_mru->arcs_size[ARC_BUFC_METADATA]));
9890 case L2ARC_MFU_DATA:
9891 return (zfs_refcount_count(
9892 &arc_mfu->arcs_size[ARC_BUFC_DATA]));
9893 case L2ARC_MRU_DATA:
9894 return (zfs_refcount_count(
9895 &arc_mru->arcs_size[ARC_BUFC_DATA]));
9896 default:
9897 return (0);
9898 }
9899 }
9900
9901 /*
9902 * Flag all sublists for a single pass for lazy marker reset to tail.
9903 * Each sublist's marker will be reset when next visited by a feed thread.
9904 */
9905 static void
l2arc_flag_pass_reset(spa_t * spa,int pass)9906 l2arc_flag_pass_reset(spa_t *spa, int pass)
9907 {
9908 ASSERT(MUTEX_HELD(&spa->spa_l2arc_info.l2arc_sublist_lock));
9909
9910 multilist_t *ml = l2arc_get_list(pass);
9911 int num_sublists = multilist_get_num_sublists(ml);
9912
9913 for (int i = 0; i < num_sublists; i++) {
9914 multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
9915 spa->spa_l2arc_info.l2arc_sublist_reset[pass][i] = B_TRUE;
9916 multilist_sublist_unlock(mls);
9917 }
9918
9919 spa->spa_l2arc_info.l2arc_ext_scanned[pass] = 0;
9920 }
9921
9922 /*
9923 * Flag all L2ARC markers for lazy reset to tail for the given spa.
9924 * Each sublist's marker will be reset when next visited by a feed thread.
9925 */
9926 static void
l2arc_reset_all_markers(spa_t * spa)9927 l2arc_reset_all_markers(spa_t *spa)
9928 {
9929 for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++)
9930 l2arc_flag_pass_reset(spa, pass);
9931
9932 /* Reset write counter */
9933 spa->spa_l2arc_info.l2arc_total_writes = 0;
9934 }
9935
9936 /*
9937 * Find and write ARC buffers to the L2ARC device.
9938 *
9939 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
9940 * for reading until they have completed writing.
9941 * The headroom_boost is an in-out parameter used to maintain headroom boost
9942 * state between calls to this function.
9943 *
9944 * Returns the number of bytes actually written (which may be smaller than
9945 * the delta by which the device hand has changed due to alignment and the
9946 * writing of log blocks).
9947 */
9948 static uint64_t
l2arc_write_buffers(spa_t * spa,l2arc_dev_t * dev,uint64_t target_sz)9949 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
9950 {
9951 arc_buf_hdr_t *head;
9952 uint64_t write_asize, write_psize, headroom;
9953 boolean_t full;
9954 l2arc_write_callback_t *cb = NULL;
9955 zio_t *pio;
9956 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
9957
9958 ASSERT3P(dev->l2ad_vdev, !=, NULL);
9959
9960 pio = NULL;
9961 write_asize = write_psize = 0;
9962 full = B_FALSE;
9963 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
9964 arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
9965
9966 /*
9967 * Determine L2ARC implementation based on total pool L2ARC capacity
9968 * vs ARC size. Use persistent markers for pools with significant
9969 * L2ARC investment, otherwise use simple HEAD/TAIL scanning.
9970 */
9971 boolean_t save_position =
9972 (spa->spa_l2arc_info.l2arc_total_capacity >=
9973 L2ARC_PERSIST_THRESHOLD);
9974
9975 /*
9976 * Check if markers need reset based on smallest device threshold.
9977 * Reset when cumulative writes exceed 1/8th of smallest device.
9978 * Must be protected since multiple device threads may check/update.
9979 */
9980 mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
9981 if (save_position && spa->spa_l2arc_info.l2arc_total_writes >=
9982 spa->spa_l2arc_info.l2arc_smallest_capacity / 8) {
9983 l2arc_reset_all_markers(spa);
9984 }
9985 mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
9986
9987 /*
9988 * Copy buffers for L2ARC writing.
9989 */
9990 boolean_t skip_meta = (save_position &&
9991 l2arc_meta_cycles > 0 &&
9992 dev->l2ad_meta_cycles >= l2arc_meta_cycles);
9993 if (skip_meta)
9994 dev->l2ad_meta_cycles = 0;
9995
9996 for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
9997 /*
9998 * pass == 0: MFU meta
9999 * pass == 1: MRU meta
10000 * pass == 2: MFU data
10001 * pass == 3: MRU data
10002 */
10003 if (l2arc_mfuonly == 1) {
10004 if (pass == 1 || pass == 3)
10005 continue;
10006 } else if (l2arc_mfuonly > 1) {
10007 if (pass == 3)
10008 continue;
10009 }
10010
10011 if (skip_meta && pass <= L2ARC_MRU_META)
10012 continue;
10013
10014 headroom = target_sz * l2arc_headroom;
10015 if (zfs_compressed_arc_enabled)
10016 headroom = (headroom * l2arc_headroom_boost) / 100;
10017
10018 multilist_t *ml = l2arc_get_list(pass);
10019 ASSERT3P(ml, !=, NULL);
10020 int num_sublists = multilist_get_num_sublists(ml);
10021 uint64_t consumed_headroom = 0;
10022
10023 /*
10024 * Equal per-sublist headroom prevents later
10025 * sublists from getting disproportionate shares
10026 * that would defeat the depth cap.
10027 */
10028 uint64_t sublist_headroom = headroom / num_sublists;
10029
10030 int current_sublist = spa->spa_l2arc_info.
10031 l2arc_next_sublist[pass];
10032 int processed_sublists = 0;
10033 while (processed_sublists < num_sublists && !full) {
10034 if (consumed_headroom >= headroom)
10035 break;
10036
10037 /*
10038 * Check if sublist is busy (being processed by another
10039 * L2ARC device thread). If so, skip to next sublist.
10040 */
10041 mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
10042 if (spa->spa_l2arc_info.l2arc_sublist_busy[pass]
10043 [current_sublist]) {
10044 mutex_exit(&spa->spa_l2arc_info.
10045 l2arc_sublist_lock);
10046 current_sublist = (current_sublist + 1) %
10047 num_sublists;
10048 processed_sublists++;
10049 continue;
10050 }
10051 /* Mark sublist as busy */
10052 spa->spa_l2arc_info.l2arc_sublist_busy[pass]
10053 [current_sublist] = B_TRUE;
10054 mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
10055
10056 /*
10057 * Write buffers from this sublist to L2ARC.
10058 * Function handles locking, marker management, and
10059 * buffer processing internally.
10060 */
10061 full = l2arc_write_sublist(spa, dev, pass,
10062 current_sublist, target_sz, &write_asize,
10063 &write_psize, &pio, &cb, head,
10064 &consumed_headroom, sublist_headroom,
10065 save_position);
10066
10067 /* Clear busy flag for this sublist */
10068 mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
10069 spa->spa_l2arc_info.l2arc_sublist_busy[pass]
10070 [current_sublist] = B_FALSE;
10071 mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
10072
10073 current_sublist = (current_sublist + 1) % num_sublists;
10074 processed_sublists++;
10075 }
10076
10077 spa->spa_l2arc_info.l2arc_next_sublist[pass] =
10078 (spa->spa_l2arc_info.l2arc_next_sublist[pass] + 1) %
10079 num_sublists;
10080
10081 /*
10082 * Count consecutive metadata monopolization toward
10083 * l2arc_meta_cycles. Only count when metadata actually
10084 * filled the write budget, starving data passes.
10085 */
10086 if (save_position && pass <= L2ARC_MRU_META && full)
10087 dev->l2ad_meta_cycles++;
10088
10089 /*
10090 * Depth cap: track cumulative bytes scanned per pass
10091 * and reset markers when the scan cap is reached.
10092 * Keeps the marker near the tail where L2ARC adds
10093 * the most value.
10094 */
10095 if (save_position) {
10096 mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
10097
10098 spa->spa_l2arc_info.l2arc_ext_scanned[pass] +=
10099 consumed_headroom;
10100
10101 uint64_t state_sz = l2arc_get_state_size(pass);
10102 uint64_t scan_cap =
10103 state_sz * l2arc_ext_headroom_pct / 100;
10104
10105 if (scan_cap > 0 &&
10106 spa->spa_l2arc_info.l2arc_ext_scanned[pass] >=
10107 scan_cap) {
10108 l2arc_flag_pass_reset(spa, pass);
10109 }
10110
10111 mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
10112 }
10113
10114 if (full == B_TRUE)
10115 break;
10116 }
10117
10118 /*
10119 * If nothing was written at all, reset monopolization counter.
10120 * No point skipping metadata if data has nothing either.
10121 */
10122 if (write_asize == 0)
10123 dev->l2ad_meta_cycles = 0;
10124
10125 /* No buffers selected for writing? */
10126 if (pio == NULL) {
10127 ASSERT0(write_psize);
10128 ASSERT(!HDR_HAS_L1HDR(head));
10129 kmem_cache_free(hdr_l2only_cache, head);
10130
10131 /*
10132 * Although we did not write any buffers l2ad_evict may
10133 * have advanced.
10134 */
10135 if (dev->l2ad_evict != l2dhdr->dh_evict)
10136 l2arc_dev_hdr_update(dev);
10137
10138 return (0);
10139 }
10140
10141 if (!dev->l2ad_first)
10142 ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
10143
10144 ASSERT3U(write_asize, <=, target_sz);
10145 ARCSTAT_BUMP(arcstat_l2_writes_sent);
10146 ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
10147
10148 dev->l2ad_writing = B_TRUE;
10149 (void) zio_wait(pio);
10150 dev->l2ad_writing = B_FALSE;
10151
10152 /*
10153 * Update cumulative write tracking for marker reset logic.
10154 * Protected for multi-device thread access.
10155 */
10156 mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock);
10157 spa->spa_l2arc_info.l2arc_total_writes += write_asize;
10158 mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock);
10159
10160 /* Track writes for DWPD rate limiting */
10161 dev->l2ad_dwpd_writes += write_asize;
10162
10163 /*
10164 * Update the device header after the zio completes as
10165 * l2arc_write_done() may have updated the memory holding the log block
10166 * pointers in the device header.
10167 */
10168 l2arc_dev_hdr_update(dev);
10169
10170 return (write_asize);
10171 }
10172
10173 static boolean_t
l2arc_hdr_limit_reached(void)10174 l2arc_hdr_limit_reached(void)
10175 {
10176 int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
10177
10178 return (arc_reclaim_needed() ||
10179 (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
10180 }
10181
10182 /*
10183 * Per-device L2ARC feed thread. Each L2ARC device has its own thread
10184 * to allow parallel writes to multiple devices.
10185 */
10186 static __attribute__((noreturn)) void
l2arc_feed_thread(void * arg)10187 l2arc_feed_thread(void *arg)
10188 {
10189 l2arc_dev_t *dev = arg;
10190 callb_cpr_t cpr;
10191 spa_t *spa;
10192 uint64_t size, wrote;
10193 clock_t begin, next = ddi_get_lbolt();
10194 fstrans_cookie_t cookie;
10195
10196 ASSERT3P(dev, !=, NULL);
10197
10198 CALLB_CPR_INIT(&cpr, &dev->l2ad_feed_thr_lock, callb_generic_cpr, FTAG);
10199
10200 mutex_enter(&dev->l2ad_feed_thr_lock);
10201
10202 cookie = spl_fstrans_mark();
10203 while (dev->l2ad_thread_exit == B_FALSE) {
10204 CALLB_CPR_SAFE_BEGIN(&cpr);
10205 (void) cv_timedwait_idle(&dev->l2ad_feed_cv,
10206 &dev->l2ad_feed_thr_lock, next);
10207 CALLB_CPR_SAFE_END(&cpr, &dev->l2ad_feed_thr_lock);
10208 next = ddi_get_lbolt() + hz;
10209
10210 /*
10211 * Check if thread should exit.
10212 */
10213 if (dev->l2ad_thread_exit)
10214 break;
10215
10216 /*
10217 * Check if device is still valid. If not, thread should exit.
10218 */
10219 if (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev))
10220 break;
10221 begin = ddi_get_lbolt();
10222
10223 /*
10224 * Try to acquire the spa config lock. If we can't get it,
10225 * skip this iteration as removal might be in progress.
10226 * The feed thread will exit naturally when it wakes up and
10227 * sees l2ad_thread_exit is set.
10228 */
10229 spa = dev->l2ad_spa;
10230 ASSERT3P(spa, !=, NULL);
10231 if (!spa_config_tryenter(spa, SCL_L2ARC, dev, RW_READER))
10232 continue;
10233
10234 /*
10235 * Avoid contributing to memory pressure.
10236 */
10237 if (l2arc_hdr_limit_reached()) {
10238 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
10239 spa_config_exit(spa, SCL_L2ARC, dev);
10240 continue;
10241 }
10242
10243 ARCSTAT_BUMP(arcstat_l2_feeds);
10244
10245 clock_t interval;
10246 size = l2arc_write_size(dev, &interval);
10247
10248 /*
10249 * Evict L2ARC buffers that will be overwritten.
10250 */
10251 l2arc_evict(dev, size, B_FALSE);
10252
10253 /*
10254 * Write ARC buffers.
10255 */
10256 wrote = l2arc_write_buffers(spa, dev, size);
10257
10258 /*
10259 * Adjust interval based on actual write.
10260 */
10261 if (wrote == 0)
10262 interval = hz * l2arc_feed_secs;
10263 else if (wrote < size)
10264 interval = (interval * wrote) / size;
10265
10266 /*
10267 * Calculate next feed time.
10268 */
10269 clock_t now = ddi_get_lbolt();
10270 next = MAX(now, MIN(now + interval, begin + interval));
10271 spa_config_exit(spa, SCL_L2ARC, dev);
10272 }
10273 spl_fstrans_unmark(cookie);
10274
10275 dev->l2ad_feed_thread = NULL;
10276 cv_broadcast(&dev->l2ad_feed_cv);
10277 CALLB_CPR_EXIT(&cpr); /* drops dev->l2ad_feed_thr_lock */
10278 thread_exit();
10279 }
10280
10281 boolean_t
l2arc_vdev_present(vdev_t * vd)10282 l2arc_vdev_present(vdev_t *vd)
10283 {
10284 return (l2arc_vdev_get(vd) != NULL);
10285 }
10286
10287 /*
10288 * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
10289 * the vdev_t isn't an L2ARC device.
10290 */
10291 l2arc_dev_t *
l2arc_vdev_get(vdev_t * vd)10292 l2arc_vdev_get(vdev_t *vd)
10293 {
10294 l2arc_dev_t *dev;
10295
10296 mutex_enter(&l2arc_dev_mtx);
10297 for (dev = list_head(l2arc_dev_list); dev != NULL;
10298 dev = list_next(l2arc_dev_list, dev)) {
10299 if (dev->l2ad_vdev == vd)
10300 break;
10301 }
10302 mutex_exit(&l2arc_dev_mtx);
10303
10304 return (dev);
10305 }
10306
/*
 * Prepare an L2ARC device for (re)use: size its log-block geometry, then
 * either flag it for a persistent-L2ARC rebuild (a valid device header
 * was read) or reinitialize the on-disk header, optionally TRIMming the
 * whole device first.  Called when a cache device is added and when it
 * is reopened/onlined (reopen == B_TRUE).
 */
static void
l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
{
	l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
	uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
	spa_t *spa = dev->l2ad_spa;

	/*
	 * After a l2arc_remove_vdev(), the spa_t will no longer be valid
	 */
	if (spa == NULL)
		return;

	/*
	 * The L2ARC has to hold at least the payload of one log block for
	 * them to be restored (persistent L2ARC). The payload of a log block
	 * depends on the amount of its log entries. We always write log blocks
	 * with 1022 entries. How many of them are committed or restored depends
	 * on the size of the L2ARC device. Thus the maximum payload of
	 * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
	 * is less than that, we reduce the amount of committed and restored
	 * log entries per block so as to enable persistence.
	 */
	if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
		/* Too small for persistence: disable log blocks entirely. */
		dev->l2ad_log_entries = 0;
	} else {
		dev->l2ad_log_entries = MIN((dev->l2ad_end -
		    dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
		    L2ARC_LOG_BLK_MAX_ENTRIES);
	}

	/*
	 * Read the device header, if an error is returned do not rebuild L2ARC.
	 */
	if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
		/*
		 * If we are onlining a cache device (vdev_reopen) that was
		 * still present (l2arc_vdev_present()) and rebuild is enabled,
		 * we should evict all ARC buffers and pointers to log blocks
		 * and reclaim their space before restoring its contents to
		 * L2ARC.
		 */
		if (reopen) {
			if (!l2arc_rebuild_enabled) {
				return;
			} else {
				l2arc_evict(dev, 0, B_TRUE);
				/* start a new log block */
				dev->l2ad_log_ent_idx = 0;
				dev->l2ad_log_blk_payload_asize = 0;
				dev->l2ad_log_blk_payload_start = 0;
			}
		}
		/*
		 * Just mark the device as pending for a rebuild. We won't
		 * be starting a rebuild in line here as it would block pool
		 * import. Instead spa_load_impl will hand that off to an
		 * async task which will call l2arc_spa_rebuild_start.
		 */
		dev->l2ad_rebuild = B_TRUE;
	} else if (spa_writeable(spa)) {
		/*
		 * In this case TRIM the whole device if l2arc_trim_ahead > 0,
		 * otherwise create a new header. We zero out the memory holding
		 * the header to reset dh_start_lbps. If we TRIM the whole
		 * device the new header will be written by
		 * vdev_trim_l2arc_thread() at the end of the TRIM to update the
		 * trim_state in the header too. When reading the header, if
		 * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
		 * we opt to TRIM the whole device again.
		 */
		if (l2arc_trim_ahead > 0) {
			dev->l2ad_trim_all = B_TRUE;
		} else {
			memset(l2dhdr, 0, l2dhdr_asize);
			l2arc_dev_hdr_update(dev);
		}
	}
}
10386
10387
10388 /*
10389 * Recalculate smallest L2ARC device capacity for the given spa.
10390 * Must be called under l2arc_dev_mtx.
10391 */
10392 static void
l2arc_update_smallest_capacity(spa_t * spa)10393 l2arc_update_smallest_capacity(spa_t *spa)
10394 {
10395 ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
10396 l2arc_dev_t *dev;
10397 uint64_t smallest = UINT64_MAX;
10398
10399 for (dev = list_head(l2arc_dev_list); dev != NULL;
10400 dev = list_next(l2arc_dev_list, dev)) {
10401 if (dev->l2ad_spa == spa) {
10402 uint64_t cap = dev->l2ad_end - dev->l2ad_start;
10403 if (cap < smallest)
10404 smallest = cap;
10405 }
10406 }
10407
10408 spa->spa_l2arc_info.l2arc_smallest_capacity = smallest;
10409 }
10410
10411 /*
10412 * Add a vdev for use by the L2ARC. By this point the spa has already
10413 * validated the vdev and opened it.
10414 */
10415 void
l2arc_add_vdev(spa_t * spa,vdev_t * vd)10416 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
10417 {
10418 l2arc_dev_t *adddev;
10419 uint64_t l2dhdr_asize;
10420
10421 ASSERT(!l2arc_vdev_present(vd));
10422
10423 /*
10424 * Create a new l2arc device entry.
10425 */
10426 adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
10427 adddev->l2ad_spa = spa;
10428 adddev->l2ad_vdev = vd;
10429 /* leave extra size for an l2arc device header */
10430 l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
10431 MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
10432 adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
10433 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
10434 ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
10435 adddev->l2ad_hand = adddev->l2ad_start;
10436 adddev->l2ad_evict = adddev->l2ad_start;
10437 adddev->l2ad_first = B_TRUE;
10438 adddev->l2ad_writing = B_FALSE;
10439 adddev->l2ad_trim_all = B_FALSE;
10440 adddev->l2ad_dwpd_writes = 0;
10441 adddev->l2ad_dwpd_start = gethrestime_sec();
10442 adddev->l2ad_dwpd_accumulated = 0;
10443 adddev->l2ad_dwpd_bump = l2arc_dwpd_bump;
10444 list_link_init(&adddev->l2ad_node);
10445 adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
10446
10447 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
10448 /*
10449 * This is a list of all ARC buffers that are still valid on the
10450 * device.
10451 */
10452 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
10453 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
10454
10455 /*
10456 * This is a list of pointers to log blocks that are still present
10457 * on the device.
10458 */
10459 list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
10460 offsetof(l2arc_lb_ptr_buf_t, node));
10461
10462 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
10463 zfs_refcount_create(&adddev->l2ad_alloc);
10464
10465 /*
10466 * Initialize per-device thread fields
10467 */
10468 adddev->l2ad_thread_exit = B_FALSE;
10469 mutex_init(&adddev->l2ad_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
10470 cv_init(&adddev->l2ad_feed_cv, NULL, CV_DEFAULT, NULL);
10471
10472 zfs_refcount_create(&adddev->l2ad_lb_asize);
10473 zfs_refcount_create(&adddev->l2ad_lb_count);
10474
10475 /*
10476 * Decide if dev is eligible for L2ARC rebuild or whole device
10477 * trimming. This has to happen before the device is added in the
10478 * cache device list and l2arc_dev_mtx is released. Otherwise
10479 * l2arc_feed_thread() might already start writing on the
10480 * device.
10481 */
10482 l2arc_rebuild_dev(adddev, B_FALSE);
10483
10484 /*
10485 * Add device to global list
10486 */
10487 mutex_enter(&l2arc_dev_mtx);
10488
10489 /*
10490 * Initialize pool-based position saving markers if this is the first
10491 * L2ARC device for this pool
10492 */
10493 if (!l2arc_pool_has_devices(spa)) {
10494 l2arc_pool_markers_init(spa);
10495 }
10496
10497 list_insert_head(l2arc_dev_list, adddev);
10498 atomic_inc_64(&l2arc_ndev);
10499 spa->spa_l2arc_info.l2arc_total_capacity += (adddev->l2ad_end -
10500 adddev->l2ad_start);
10501 l2arc_update_smallest_capacity(spa);
10502
10503 /*
10504 * Create per-device feed thread only if spa is writable.
10505 * The thread name includes the spa name and device number
10506 * for easy identification.
10507 */
10508 if (spa_writeable(spa)) {
10509 char thread_name[MAXNAMELEN];
10510 snprintf(thread_name, sizeof (thread_name), "l2arc_%s_%llu",
10511 spa_name(spa), (u_longlong_t)vd->vdev_id);
10512 adddev->l2ad_feed_thread = thread_create_named(thread_name,
10513 NULL, 0, l2arc_feed_thread, adddev, 0, &p0, TS_RUN,
10514 minclsyspri);
10515 if (adddev->l2ad_feed_thread == NULL) {
10516 cmn_err(CE_WARN, "l2arc: failed to create feed thread "
10517 "for vdev %llu in pool '%s'",
10518 (u_longlong_t)vd->vdev_id, spa_name(spa));
10519 }
10520 } else {
10521 adddev->l2ad_feed_thread = NULL;
10522 }
10523
10524 mutex_exit(&l2arc_dev_mtx);
10525 }
10526
10527 /*
10528 * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen()
10529 * in case of onlining a cache device.
10530 */
10531 void
l2arc_rebuild_vdev(vdev_t * vd,boolean_t reopen)10532 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
10533 {
10534 l2arc_dev_t *dev = NULL;
10535
10536 dev = l2arc_vdev_get(vd);
10537 ASSERT3P(dev, !=, NULL);
10538
10539 /*
10540 * In contrast to l2arc_add_vdev() we do not have to worry about
10541 * l2arc_feed_thread() invalidating previous content when onlining a
10542 * cache device. The device parameters (l2ad*) are not cleared when
10543 * offlining the device and writing new buffers will not invalidate
10544 * all previous content. In worst case only buffers that have not had
10545 * their log block written to the device will be lost.
10546 * When onlining the cache device (ie offline->online without exporting
10547 * the pool in between) this happens:
10548 * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
10549 * | |
10550 * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE
10551 * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild
10552 * is set to B_TRUE we might write additional buffers to the device.
10553 */
10554 l2arc_rebuild_dev(dev, reopen);
10555 }
10556
/*
 * Arguments for l2arc_device_teardown().  Teardown may run asynchronously
 * from a taskq after the spa/vdev pointers have been invalidated (pool
 * export/destroy), so the GUIDs needed for logging and async-flush
 * tracking are captured up front.
 */
typedef struct {
	l2arc_dev_t *rva_l2arc_dev;	/* device being torn down */
	uint64_t rva_spa_gid;		/* spa load guid, for flush tracking */
	uint64_t rva_vdev_gid;		/* vdev guid, for dbgmsg output */
	boolean_t rva_async;		/* teardown dispatched to a taskq? */

} remove_vdev_args_t;
10564
/*
 * Final teardown of an L2ARC device that has already been unlinked from
 * the global device list: evict all of its ARC buffers and log-block
 * pointers, then destroy its lists, locks and refcounts and free its
 * memory.  Runs synchronously from l2arc_remove_vdev(), or from the
 * arc_flush_taskq when the pool is being exported/destroyed (rva_async).
 */
static void
l2arc_device_teardown(void *arg)
{
	remove_vdev_args_t *rva = arg;
	l2arc_dev_t *remdev = rva->rva_l2arc_dev;
	hrtime_t start_time = gethrtime();

	/*
	 * Clear all buflists and ARC references. L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
	list_destroy(&remdev->l2ad_lbptr_list);
	mutex_destroy(&remdev->l2ad_mtx);
	mutex_destroy(&remdev->l2ad_feed_thr_lock);
	cv_destroy(&remdev->l2ad_feed_cv);
	zfs_refcount_destroy(&remdev->l2ad_alloc);
	zfs_refcount_destroy(&remdev->l2ad_lb_asize);
	zfs_refcount_destroy(&remdev->l2ad_lb_count);
	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
	vmem_free(remdev, sizeof (l2arc_dev_t));

	/* Log how long the flush took (only if it took a measurable time). */
	uint64_t elapsed = NSEC2MSEC(gethrtime() - start_time);
	if (elapsed > 0) {
		zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms",
		    (u_longlong_t)rva->rva_spa_gid,
		    (u_longlong_t)rva->rva_vdev_gid,
		    (u_longlong_t)elapsed);
	}

	/* For async teardown, mark the pending flush as completed. */
	if (rva->rva_async)
		arc_async_flush_remove(rva->rva_spa_gid, 2);
	kmem_free(rva, sizeof (remove_vdev_args_t));
}
10600
10601 /*
10602 * Remove a vdev from the L2ARC.
10603 */
10604 void
l2arc_remove_vdev(vdev_t * vd)10605 l2arc_remove_vdev(vdev_t *vd)
10606 {
10607 spa_t *spa = vd->vdev_spa;
10608 boolean_t asynchronous = spa->spa_state == POOL_STATE_EXPORTED ||
10609 spa->spa_state == POOL_STATE_DESTROYED;
10610
10611 /*
10612 * Find the device by vdev
10613 */
10614 l2arc_dev_t *remdev = l2arc_vdev_get(vd);
10615 ASSERT3P(remdev, !=, NULL);
10616
10617 /*
10618 * Save info for final teardown
10619 */
10620 remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t),
10621 KM_SLEEP);
10622 rva->rva_l2arc_dev = remdev;
10623 rva->rva_spa_gid = spa_load_guid(spa);
10624 rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid;
10625
10626 /*
10627 * Cancel any ongoing or scheduled rebuild.
10628 */
10629 mutex_enter(&l2arc_rebuild_thr_lock);
10630 remdev->l2ad_rebuild_cancel = B_TRUE;
10631 if (remdev->l2ad_rebuild_began == B_TRUE) {
10632 while (remdev->l2ad_rebuild == B_TRUE)
10633 cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
10634 }
10635 mutex_exit(&l2arc_rebuild_thr_lock);
10636
10637 /*
10638 * Signal per-device feed thread to exit and wait for it.
10639 * Thread only exists if pool was imported read-write.
10640 */
10641 if (remdev->l2ad_feed_thread != NULL) {
10642 mutex_enter(&remdev->l2ad_feed_thr_lock);
10643 remdev->l2ad_thread_exit = B_TRUE;
10644 cv_signal(&remdev->l2ad_feed_cv);
10645 while (remdev->l2ad_feed_thread != NULL)
10646 cv_wait(&remdev->l2ad_feed_cv,
10647 &remdev->l2ad_feed_thr_lock);
10648 mutex_exit(&remdev->l2ad_feed_thr_lock);
10649 }
10650
10651 rva->rva_async = asynchronous;
10652
10653 /*
10654 * Remove device from global list
10655 */
10656 ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC);
10657 mutex_enter(&l2arc_dev_mtx);
10658 list_remove(l2arc_dev_list, remdev);
10659 atomic_dec_64(&l2arc_ndev);
10660 spa->spa_l2arc_info.l2arc_total_capacity -=
10661 (remdev->l2ad_end - remdev->l2ad_start);
10662 l2arc_update_smallest_capacity(spa);
10663
10664 /*
10665 * Clean up pool-based markers if this was the last L2ARC device
10666 * for this pool
10667 */
10668 if (!l2arc_pool_has_devices(spa)) {
10669 l2arc_pool_markers_fini(spa);
10670 }
10671
10672 /* During a pool export spa & vdev will no longer be valid */
10673 if (asynchronous) {
10674 remdev->l2ad_spa = NULL;
10675 remdev->l2ad_vdev = NULL;
10676 }
10677 mutex_exit(&l2arc_dev_mtx);
10678
10679 if (!asynchronous) {
10680 l2arc_device_teardown(rva);
10681 return;
10682 }
10683
10684 arc_async_flush_t *af = arc_async_flush_add(rva->rva_spa_gid, 2);
10685
10686 taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva,
10687 TQ_SLEEP, &af->af_tqent);
10688 }
10689
/*
 * One-time module initialization for the L2ARC subsystem: the global
 * rebuild/device/free-on-write locks and condition variable, and the
 * global device and free-on-write lists.
 */
void
l2arc_init(void)
{
	l2arc_ndev = 0;

	mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}
10707
/*
 * Module teardown counterpart to l2arc_init(): destroy the global L2ARC
 * locks, condition variable, and lists.
 */
void
l2arc_fini(void)
{
	mutex_destroy(&l2arc_rebuild_thr_lock);
	cv_destroy(&l2arc_rebuild_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}
10719
10720
10721 /*
10722 * Punches out rebuild threads for the L2ARC devices in a spa. This should
10723 * be called after pool import from the spa async thread, since starting
10724 * these threads directly from spa_import() will make them part of the
10725 * "zpool import" context and delay process exit (and thus pool import).
10726 */
10727 void
l2arc_spa_rebuild_start(spa_t * spa)10728 l2arc_spa_rebuild_start(spa_t *spa)
10729 {
10730 ASSERT(spa_namespace_held());
10731
10732 /*
10733 * Locate the spa's l2arc devices and kick off rebuild threads.
10734 */
10735 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
10736 l2arc_dev_t *dev =
10737 l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
10738 if (dev == NULL) {
10739 /* Don't attempt a rebuild if the vdev is UNAVAIL */
10740 continue;
10741 }
10742 mutex_enter(&l2arc_rebuild_thr_lock);
10743 if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
10744 dev->l2ad_rebuild_began = B_TRUE;
10745 (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread,
10746 dev, 0, &p0, TS_RUN, minclsyspri);
10747 }
10748 mutex_exit(&l2arc_rebuild_thr_lock);
10749 }
10750 }
10751
/*
 * Cancels and then waits out all in-flight L2ARC rebuild threads for a spa.
 * Called with either the namespace lock held or from the export thread.
 *
 * Cancellation is done in two passes on purpose: first flag every device's
 * l2ad_rebuild_cancel so all rebuild threads start winding down
 * concurrently, then block on each device until its thread clears
 * l2ad_rebuild and signals l2arc_rebuild_thr_cv
 * (see l2arc_dev_rebuild_thread()).
 */
void
l2arc_spa_rebuild_stop(spa_t *spa)
{
	ASSERT(spa_namespace_held() ||
	    spa->spa_export_thread == curthread);

	/* Pass 1: request cancellation on every L2ARC device. */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		l2arc_dev_t *dev =
		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
		if (dev == NULL)
			continue;
		mutex_enter(&l2arc_rebuild_thr_lock);
		dev->l2ad_rebuild_cancel = B_TRUE;
		mutex_exit(&l2arc_rebuild_thr_lock);
	}
	/* Pass 2: wait for each rebuild thread that actually started. */
	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
		l2arc_dev_t *dev =
		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
		if (dev == NULL)
			continue;
		mutex_enter(&l2arc_rebuild_thr_lock);
		if (dev->l2ad_rebuild_began == B_TRUE) {
			while (dev->l2ad_rebuild == B_TRUE) {
				cv_wait(&l2arc_rebuild_thr_cv,
				    &l2arc_rebuild_thr_lock);
			}
		}
		mutex_exit(&l2arc_rebuild_thr_lock);
	}
}
10782
10783 /*
10784 * Main entry point for L2ARC rebuilding.
10785 */
10786 static __attribute__((noreturn)) void
l2arc_dev_rebuild_thread(void * arg)10787 l2arc_dev_rebuild_thread(void *arg)
10788 {
10789 l2arc_dev_t *dev = arg;
10790
10791 VERIFY(dev->l2ad_rebuild);
10792 (void) l2arc_rebuild(dev);
10793 mutex_enter(&l2arc_rebuild_thr_lock);
10794 dev->l2ad_rebuild_began = B_FALSE;
10795 dev->l2ad_rebuild = B_FALSE;
10796 cv_signal(&l2arc_rebuild_thr_cv);
10797 mutex_exit(&l2arc_rebuild_thr_lock);
10798
10799 thread_exit();
10800 }
10801
10802 /*
10803 * This function implements the actual L2ARC metadata rebuild. It:
10804 * starts reading the log block chain and restores each block's contents
10805 * to memory (reconstructing arc_buf_hdr_t's).
10806 *
10807 * Operation stops under any of the following conditions:
10808 *
10809 * 1) We reach the end of the log block chain.
10810 * 2) We encounter *any* error condition (cksum errors, io errors)
10811 */
10812 static int
l2arc_rebuild(l2arc_dev_t * dev)10813 l2arc_rebuild(l2arc_dev_t *dev)
10814 {
10815 vdev_t *vd = dev->l2ad_vdev;
10816 spa_t *spa = vd->vdev_spa;
10817 int err = 0;
10818 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
10819 l2arc_log_blk_phys_t *this_lb, *next_lb;
10820 zio_t *this_io = NULL, *next_io = NULL;
10821 l2arc_log_blkptr_t lbps[2];
10822 l2arc_lb_ptr_buf_t *lb_ptr_buf;
10823 boolean_t lock_held;
10824
10825 this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
10826 next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
10827
10828 /*
10829 * We prevent device removal while issuing reads to the device,
10830 * then during the rebuilding phases we drop this lock again so
10831 * that a spa_unload or device remove can be initiated - this is
10832 * safe, because the spa will signal us to stop before removing
10833 * our device and wait for us to stop.
10834 */
10835 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
10836 lock_held = B_TRUE;
10837
10838 /*
10839 * Retrieve the persistent L2ARC device state.
10840 * L2BLK_GET_PSIZE returns aligned size for log blocks.
10841 */
10842 dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
10843 dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
10844 L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
10845 dev->l2ad_start);
10846 dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
10847
10848 vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
10849 vd->vdev_trim_state = l2dhdr->dh_trim_state;
10850
10851 /*
10852 * In case the zfs module parameter l2arc_rebuild_enabled is false
10853 * we do not start the rebuild process.
10854 */
10855 if (!l2arc_rebuild_enabled)
10856 goto out;
10857
10858 /* Prepare the rebuild process */
10859 memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
10860
10861 /* Start the rebuild process */
10862 for (;;) {
10863 if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
10864 break;
10865
10866 if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
10867 this_lb, next_lb, this_io, &next_io)) != 0)
10868 goto out;
10869
10870 /*
10871 * Our memory pressure valve. If the system is running low
10872 * on memory, rather than swamping memory with new ARC buf
10873 * hdrs, we opt not to rebuild the L2ARC. At this point,
10874 * however, we have already set up our L2ARC dev to chain in
10875 * new metadata log blocks, so the user may choose to offline/
10876 * online the L2ARC dev at a later time (or re-import the pool)
10877 * to reconstruct it (when there's less memory pressure).
10878 */
10879 if (l2arc_hdr_limit_reached()) {
10880 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
10881 cmn_err(CE_NOTE, "System running low on memory, "
10882 "aborting L2ARC rebuild.");
10883 err = SET_ERROR(ENOMEM);
10884 goto out;
10885 }
10886
10887 spa_config_exit(spa, SCL_L2ARC, vd);
10888 lock_held = B_FALSE;
10889
10890 /*
10891 * Now that we know that the next_lb checks out alright, we
10892 * can start reconstruction from this log block.
10893 * L2BLK_GET_PSIZE returns aligned size for log blocks.
10894 */
10895 uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
10896 l2arc_log_blk_restore(dev, this_lb, asize);
10897
10898 /*
10899 * log block restored, include its pointer in the list of
10900 * pointers to log blocks present in the L2ARC device.
10901 */
10902 lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
10903 lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
10904 KM_SLEEP);
10905 memcpy(lb_ptr_buf->lb_ptr, &lbps[0],
10906 sizeof (l2arc_log_blkptr_t));
10907 mutex_enter(&dev->l2ad_mtx);
10908 list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
10909 ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
10910 ARCSTAT_BUMP(arcstat_l2_log_blk_count);
10911 zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
10912 zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
10913 mutex_exit(&dev->l2ad_mtx);
10914 vdev_space_update(vd, asize, 0, 0);
10915
10916 /*
10917 * Protection against loops of log blocks:
10918 *
10919 * l2ad_hand l2ad_evict
10920 * V V
10921 * l2ad_start |=======================================| l2ad_end
10922 * -----|||----|||---|||----|||
10923 * (3) (2) (1) (0)
10924 * ---|||---|||----|||---|||
10925 * (7) (6) (5) (4)
10926 *
10927 * In this situation the pointer of log block (4) passes
10928 * l2arc_log_blkptr_valid() but the log block should not be
10929 * restored as it is overwritten by the payload of log block
10930 * (0). Only log blocks (0)-(3) should be restored. We check
10931 * whether l2ad_evict lies in between the payload starting
10932 * offset of the next log block (lbps[1].lbp_payload_start)
10933 * and the payload starting offset of the present log block
10934 * (lbps[0].lbp_payload_start). If true and this isn't the
10935 * first pass, we are looping from the beginning and we should
10936 * stop.
10937 */
10938 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
10939 lbps[0].lbp_payload_start, dev->l2ad_evict) &&
10940 !dev->l2ad_first)
10941 goto out;
10942
10943 kpreempt(KPREEMPT_SYNC);
10944 for (;;) {
10945 mutex_enter(&l2arc_rebuild_thr_lock);
10946 if (dev->l2ad_rebuild_cancel) {
10947 mutex_exit(&l2arc_rebuild_thr_lock);
10948 err = SET_ERROR(ECANCELED);
10949 goto out;
10950 }
10951 mutex_exit(&l2arc_rebuild_thr_lock);
10952 if (spa_config_tryenter(spa, SCL_L2ARC, vd,
10953 RW_READER)) {
10954 lock_held = B_TRUE;
10955 break;
10956 }
10957 /*
10958 * L2ARC config lock held by somebody in writer,
10959 * possibly due to them trying to remove us. They'll
10960 * likely to want us to shut down, so after a little
10961 * delay, we check l2ad_rebuild_cancel and retry
10962 * the lock again.
10963 */
10964 delay(1);
10965 }
10966
10967 /*
10968 * Continue with the next log block.
10969 */
10970 lbps[0] = lbps[1];
10971 lbps[1] = this_lb->lb_prev_lbp;
10972 PTR_SWAP(this_lb, next_lb);
10973 this_io = next_io;
10974 next_io = NULL;
10975 }
10976
10977 if (this_io != NULL)
10978 l2arc_log_blk_fetch_abort(this_io);
10979 out:
10980 if (next_io != NULL)
10981 l2arc_log_blk_fetch_abort(next_io);
10982 vmem_free(this_lb, sizeof (*this_lb));
10983 vmem_free(next_lb, sizeof (*next_lb));
10984
10985 if (err == ECANCELED) {
10986 /*
10987 * In case the rebuild was canceled do not log to spa history
10988 * log as the pool may be in the process of being removed.
10989 */
10990 zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
10991 (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
10992 return (err);
10993 } else if (!l2arc_rebuild_enabled) {
10994 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10995 "disabled");
10996 } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
10997 ARCSTAT_BUMP(arcstat_l2_rebuild_success);
10998 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
10999 "successful, restored %llu blocks",
11000 (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
11001 } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
11002 /*
11003 * No error but also nothing restored, meaning the lbps array
11004 * in the device header points to invalid/non-present log
11005 * blocks. Reset the header.
11006 */
11007 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
11008 "no valid log blocks");
11009 memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
11010 l2arc_dev_hdr_update(dev);
11011 } else if (err != 0) {
11012 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
11013 "aborted, restored %llu blocks",
11014 (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
11015 }
11016
11017 if (lock_held)
11018 spa_config_exit(spa, SCL_L2ARC, vd);
11019
11020 return (err);
11021 }
11022
11023 /*
11024 * Attempts to read the device header on the provided L2ARC device and writes
11025 * it to `hdr'. On success, this function returns 0, otherwise the appropriate
11026 * error code is returned.
11027 */
11028 static int
l2arc_dev_hdr_read(l2arc_dev_t * dev)11029 l2arc_dev_hdr_read(l2arc_dev_t *dev)
11030 {
11031 int err;
11032 uint64_t guid;
11033 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
11034 const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
11035 abd_t *abd;
11036
11037 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
11038
11039 abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
11040
11041 err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
11042 VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
11043 ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
11044 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
11045 ZIO_FLAG_SPECULATIVE, B_FALSE));
11046
11047 abd_free(abd);
11048
11049 if (err != 0) {
11050 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
11051 zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
11052 "vdev guid: %llu", err,
11053 (u_longlong_t)dev->l2ad_vdev->vdev_guid);
11054 return (err);
11055 }
11056
11057 if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
11058 byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
11059
11060 if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
11061 l2dhdr->dh_spa_guid != guid ||
11062 l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
11063 l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
11064 l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
11065 l2dhdr->dh_end != dev->l2ad_end ||
11066 !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
11067 l2dhdr->dh_evict) ||
11068 (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
11069 l2arc_trim_ahead > 0)) {
11070 /*
11071 * Attempt to rebuild a device containing no actual dev hdr
11072 * or containing a header from some other pool or from another
11073 * version of persistent L2ARC.
11074 */
11075 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
11076 return (SET_ERROR(ENOTSUP));
11077 }
11078
11079 return (0);
11080 }
11081
11082 /*
11083 * Reads L2ARC log blocks from storage and validates their contents.
11084 *
11085 * This function implements a simple fetcher to make sure that while
11086 * we're processing one buffer the L2ARC is already fetching the next
11087 * one in the chain.
11088 *
11089 * The arguments this_lp and next_lp point to the current and next log block
11090 * address in the block chain. Similarly, this_lb and next_lb hold the
11091 * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
11092 *
11093 * The `this_io' and `next_io' arguments are used for block fetching.
11094 * When issuing the first blk IO during rebuild, you should pass NULL for
11095 * `this_io'. This function will then issue a sync IO to read the block and
11096 * also issue an async IO to fetch the next block in the block chain. The
11097 * fetched IO is returned in `next_io'. On subsequent calls to this
11098 * function, pass the value returned in `next_io' from the previous call
11099 * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
11100 * Prior to the call, you should initialize your `next_io' pointer to be
11101 * NULL. If no fetch IO was issued, the pointer is left set at NULL.
11102 *
11103 * On success, this function returns 0, otherwise it returns an appropriate
11104 * error code. On error the fetching IO is aborted and cleared before
11105 * returning from this function. Therefore, if we return `success', the
11106 * caller can assume that we have taken care of cleanup of fetch IOs.
11107 */
11108 static int
l2arc_log_blk_read(l2arc_dev_t * dev,const l2arc_log_blkptr_t * this_lbp,const l2arc_log_blkptr_t * next_lbp,l2arc_log_blk_phys_t * this_lb,l2arc_log_blk_phys_t * next_lb,zio_t * this_io,zio_t ** next_io)11109 l2arc_log_blk_read(l2arc_dev_t *dev,
11110 const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
11111 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
11112 zio_t *this_io, zio_t **next_io)
11113 {
11114 int err = 0;
11115 zio_cksum_t cksum;
11116 uint64_t asize;
11117
11118 ASSERT(this_lbp != NULL && next_lbp != NULL);
11119 ASSERT(this_lb != NULL && next_lb != NULL);
11120 ASSERT(next_io != NULL && *next_io == NULL);
11121 ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
11122
11123 /*
11124 * Check to see if we have issued the IO for this log block in a
11125 * previous run. If not, this is the first call, so issue it now.
11126 */
11127 if (this_io == NULL) {
11128 this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
11129 this_lb);
11130 }
11131
11132 /*
11133 * Peek to see if we can start issuing the next IO immediately.
11134 */
11135 if (l2arc_log_blkptr_valid(dev, next_lbp)) {
11136 /*
11137 * Start issuing IO for the next log block early - this
11138 * should help keep the L2ARC device busy while we
11139 * decompress and restore this log block.
11140 */
11141 *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
11142 next_lb);
11143 }
11144
11145 /* Wait for the IO to read this log block to complete */
11146 if ((err = zio_wait(this_io)) != 0) {
11147 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
11148 zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
11149 "offset: %llu, vdev guid: %llu", err,
11150 (u_longlong_t)this_lbp->lbp_daddr,
11151 (u_longlong_t)dev->l2ad_vdev->vdev_guid);
11152 goto cleanup;
11153 }
11154
11155 /*
11156 * Make sure the buffer checks out.
11157 * L2BLK_GET_PSIZE returns aligned size for log blocks.
11158 */
11159 asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
11160 fletcher_4_native(this_lb, asize, NULL, &cksum);
11161 if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
11162 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
11163 zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
11164 "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
11165 (u_longlong_t)this_lbp->lbp_daddr,
11166 (u_longlong_t)dev->l2ad_vdev->vdev_guid,
11167 (u_longlong_t)dev->l2ad_hand,
11168 (u_longlong_t)dev->l2ad_evict);
11169 err = SET_ERROR(ECKSUM);
11170 goto cleanup;
11171 }
11172
11173 /* Now we can take our time decoding this buffer */
11174 switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
11175 case ZIO_COMPRESS_OFF:
11176 break;
11177 case ZIO_COMPRESS_LZ4: {
11178 abd_t *abd = abd_alloc_linear(asize, B_TRUE);
11179 abd_copy_from_buf_off(abd, this_lb, 0, asize);
11180 abd_t dabd;
11181 abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb));
11182 err = zio_decompress_data(
11183 L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
11184 abd, &dabd, asize, sizeof (*this_lb), NULL);
11185 abd_free(&dabd);
11186 abd_free(abd);
11187 if (err != 0) {
11188 err = SET_ERROR(EINVAL);
11189 goto cleanup;
11190 }
11191 break;
11192 }
11193 default:
11194 err = SET_ERROR(EINVAL);
11195 goto cleanup;
11196 }
11197 if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
11198 byteswap_uint64_array(this_lb, sizeof (*this_lb));
11199 if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
11200 err = SET_ERROR(EINVAL);
11201 goto cleanup;
11202 }
11203 cleanup:
11204 /* Abort an in-flight fetch I/O in case of error */
11205 if (err != 0 && *next_io != NULL) {
11206 l2arc_log_blk_fetch_abort(*next_io);
11207 *next_io = NULL;
11208 }
11209 return (err);
11210 }
11211
11212 /*
11213 * Restores the payload of a log block to ARC. This creates empty ARC hdr
11214 * entries which only contain an l2arc hdr, essentially restoring the
11215 * buffers to their L2ARC evicted state. This function also updates space
11216 * usage on the L2ARC vdev to make sure it tracks restored buffers.
11217 */
11218 static void
l2arc_log_blk_restore(l2arc_dev_t * dev,const l2arc_log_blk_phys_t * lb,uint64_t lb_asize)11219 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
11220 uint64_t lb_asize)
11221 {
11222 uint64_t size = 0, asize = 0;
11223 uint64_t log_entries = dev->l2ad_log_entries;
11224
11225 /*
11226 * Usually arc_adapt() is called only for data, not headers, but
11227 * since we may allocate significant amount of memory here, let ARC
11228 * grow its arc_c.
11229 */
11230 arc_adapt(log_entries * HDR_L2ONLY_SIZE);
11231
11232 for (int i = log_entries - 1; i >= 0; i--) {
11233 /*
11234 * Restore goes in the reverse temporal direction to preserve
11235 * correct temporal ordering of buffers in the l2ad_buflist.
11236 * l2arc_hdr_restore also does a list_insert_tail instead of
11237 * list_insert_head on the l2ad_buflist:
11238 *
11239 * LIST l2ad_buflist LIST
11240 * HEAD <------ (time) ------ TAIL
11241 * direction +-----+-----+-----+-----+-----+ direction
11242 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
11243 * fill +-----+-----+-----+-----+-----+
11244 * ^ ^
11245 * | |
11246 * | |
11247 * l2arc_feed_thread l2arc_rebuild
11248 * will place new bufs here restores bufs here
11249 *
11250 * During l2arc_rebuild() the device is not used by
11251 * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
11252 */
11253 size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
11254 asize += vdev_psize_to_asize(dev->l2ad_vdev,
11255 L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
11256 l2arc_hdr_restore(&lb->lb_entries[i], dev);
11257 }
11258
11259 /*
11260 * Record rebuild stats:
11261 * size Logical size of restored buffers in the L2ARC
11262 * asize Aligned size of restored buffers in the L2ARC
11263 */
11264 ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
11265 ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
11266 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
11267 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
11268 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
11269 ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
11270 }
11271
11272 /*
11273 * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
11274 * into a state indicating that it has been evicted to L2ARC.
11275 */
11276 static void
l2arc_hdr_restore(const l2arc_log_ent_phys_t * le,l2arc_dev_t * dev)11277 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
11278 {
11279 arc_buf_hdr_t *hdr, *exists;
11280 kmutex_t *hash_lock;
11281 arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
11282 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
11283 L2BLK_GET_PSIZE((le)->le_prop));
11284
11285 /*
11286 * Do all the allocation before grabbing any locks, this lets us
11287 * sleep if memory is full and we don't have to deal with failed
11288 * allocations.
11289 */
11290 hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
11291 dev, le->le_dva, le->le_daddr,
11292 L2BLK_GET_PSIZE((le)->le_prop), asize, le->le_birth,
11293 L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
11294 L2BLK_GET_PROTECTED((le)->le_prop),
11295 L2BLK_GET_PREFETCH((le)->le_prop),
11296 L2BLK_GET_STATE((le)->le_prop));
11297
11298 /*
11299 * vdev_space_update() has to be called before arc_hdr_destroy() to
11300 * avoid underflow since the latter also calls vdev_space_update().
11301 */
11302 l2arc_hdr_arcstats_increment(hdr);
11303 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
11304
11305 mutex_enter(&dev->l2ad_mtx);
11306 list_insert_tail(&dev->l2ad_buflist, hdr);
11307 (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
11308 mutex_exit(&dev->l2ad_mtx);
11309
11310 exists = buf_hash_insert(hdr, &hash_lock);
11311 if (exists) {
11312 /* Buffer was already cached, no need to restore it. */
11313 arc_hdr_destroy(hdr);
11314 /*
11315 * If the buffer is already cached, check whether it has
11316 * L2ARC metadata. If not, enter them and update the flag.
11317 * This is important is case of onlining a cache device, since
11318 * we previously evicted all L2ARC metadata from ARC.
11319 */
11320 if (!HDR_HAS_L2HDR(exists)) {
11321 arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
11322 exists->b_l2hdr.b_dev = dev;
11323 exists->b_l2hdr.b_daddr = le->le_daddr;
11324 exists->b_l2hdr.b_arcs_state =
11325 L2BLK_GET_STATE((le)->le_prop);
11326 /* l2arc_hdr_arcstats_update() expects a valid asize */
11327 HDR_SET_L2SIZE(exists, asize);
11328 mutex_enter(&dev->l2ad_mtx);
11329 list_insert_tail(&dev->l2ad_buflist, exists);
11330 (void) zfs_refcount_add_many(&dev->l2ad_alloc,
11331 arc_hdr_size(exists), exists);
11332 mutex_exit(&dev->l2ad_mtx);
11333 l2arc_hdr_arcstats_increment(exists);
11334 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
11335 }
11336 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
11337 }
11338
11339 mutex_exit(hash_lock);
11340 }
11341
11342 /*
11343 * Starts an asynchronous read IO to read a log block. This is used in log
11344 * block reconstruction to start reading the next block before we are done
11345 * decoding and reconstructing the current block, to keep the l2arc device
11346 * nice and hot with read IO to process.
11347 * The returned zio will contain a newly allocated memory buffers for the IO
11348 * data which should then be freed by the caller once the zio is no longer
11349 * needed (i.e. due to it having completed). If you wish to abort this
11350 * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
11351 * care of disposing of the allocated buffers correctly.
11352 */
11353 static zio_t *
l2arc_log_blk_fetch(vdev_t * vd,const l2arc_log_blkptr_t * lbp,l2arc_log_blk_phys_t * lb)11354 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
11355 l2arc_log_blk_phys_t *lb)
11356 {
11357 uint32_t asize;
11358 zio_t *pio;
11359 l2arc_read_callback_t *cb;
11360
11361 /* L2BLK_GET_PSIZE returns aligned size for log blocks */
11362 asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
11363 ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
11364
11365 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
11366 cb->l2rcb_abd = abd_get_from_buf(lb, asize);
11367 pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
11368 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
11369 (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
11370 cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
11371 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
11372 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
11373
11374 return (pio);
11375 }
11376
11377 /*
11378 * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
11379 * buffers allocated for it.
11380 */
11381 static void
l2arc_log_blk_fetch_abort(zio_t * zio)11382 l2arc_log_blk_fetch_abort(zio_t *zio)
11383 {
11384 (void) zio_wait(zio);
11385 }
11386
11387 /*
11388 * Creates a zio to update the device header on an l2arc device.
11389 */
11390 void
l2arc_dev_hdr_update(l2arc_dev_t * dev)11391 l2arc_dev_hdr_update(l2arc_dev_t *dev)
11392 {
11393 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
11394 const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
11395 abd_t *abd;
11396 int err;
11397
11398 VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
11399
11400 l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
11401 l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
11402 l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
11403 l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
11404 l2dhdr->dh_log_entries = dev->l2ad_log_entries;
11405 l2dhdr->dh_evict = dev->l2ad_evict;
11406 l2dhdr->dh_start = dev->l2ad_start;
11407 l2dhdr->dh_end = dev->l2ad_end;
11408 l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
11409 l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
11410 l2dhdr->dh_flags = 0;
11411 l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
11412 l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
11413 if (dev->l2ad_first)
11414 l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
11415
11416 abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
11417
11418 err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
11419 VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
11420 NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
11421
11422 abd_free(abd);
11423
11424 if (err != 0) {
11425 zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
11426 "vdev guid: %llu", err,
11427 (u_longlong_t)dev->l2ad_vdev->vdev_guid);
11428 }
11429 }
11430
11431 /*
11432 * Commits a log block to the L2ARC device. This routine is invoked from
11433 * l2arc_write_buffers when the log block fills up.
11434 * This function allocates some memory to temporarily hold the serialized
11435 * buffer to be written. This is then released in l2arc_write_done.
11436 */
11437 static uint64_t
l2arc_log_blk_commit(l2arc_dev_t * dev,zio_t * pio,l2arc_write_callback_t * cb)11438 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
11439 {
11440 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
11441 l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
11442 uint64_t psize, asize;
11443 zio_t *wzio;
11444 l2arc_lb_abd_buf_t *abd_buf;
11445 abd_t *abd = NULL;
11446 l2arc_lb_ptr_buf_t *lb_ptr_buf;
11447
11448 VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
11449
11450 abd_buf = zio_buf_alloc(sizeof (*abd_buf));
11451 abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
11452 lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
11453 lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
11454
11455 /* link the buffer into the block chain */
11456 lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
11457 lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
11458
11459 /*
11460 * l2arc_log_blk_commit() may be called multiple times during a single
11461 * l2arc_write_buffers() call. Save the allocated abd buffers in a list
11462 * so we can free them in l2arc_write_done() later on.
11463 */
11464 list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
11465
11466 /* try to compress the buffer, at least one sector to save */
11467 psize = zio_compress_data(ZIO_COMPRESS_LZ4,
11468 abd_buf->abd, &abd, sizeof (*lb),
11469 zio_get_compression_max_size(ZIO_COMPRESS_LZ4,
11470 dev->l2ad_vdev->vdev_ashift,
11471 dev->l2ad_vdev->vdev_ashift, sizeof (*lb)), 0);
11472
11473 /* a log block is never entirely zero */
11474 ASSERT(psize != 0);
11475 asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
11476 ASSERT(asize <= sizeof (*lb));
11477
11478 /*
11479 * Update the start log block pointer in the device header to point
11480 * to the log block we're about to write.
11481 */
11482 l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
11483 l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
11484 l2dhdr->dh_start_lbps[0].lbp_payload_asize =
11485 dev->l2ad_log_blk_payload_asize;
11486 l2dhdr->dh_start_lbps[0].lbp_payload_start =
11487 dev->l2ad_log_blk_payload_start;
11488 L2BLK_SET_LSIZE(
11489 (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
11490 L2BLK_SET_PSIZE(
11491 (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
11492 L2BLK_SET_CHECKSUM(
11493 (&l2dhdr->dh_start_lbps[0])->lbp_prop,
11494 ZIO_CHECKSUM_FLETCHER_4);
11495 if (asize < sizeof (*lb)) {
11496 /* compression succeeded */
11497 abd_zero_off(abd, psize, asize - psize);
11498 L2BLK_SET_COMPRESS(
11499 (&l2dhdr->dh_start_lbps[0])->lbp_prop,
11500 ZIO_COMPRESS_LZ4);
11501 } else {
11502 /* compression failed */
11503 abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb));
11504 L2BLK_SET_COMPRESS(
11505 (&l2dhdr->dh_start_lbps[0])->lbp_prop,
11506 ZIO_COMPRESS_OFF);
11507 }
11508
11509 /* checksum what we're about to write */
11510 abd_fletcher_4_native(abd, asize, NULL,
11511 &l2dhdr->dh_start_lbps[0].lbp_cksum);
11512
11513 abd_free(abd_buf->abd);
11514
11515 /* perform the write itself */
11516 abd_buf->abd = abd;
11517 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
11518 asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
11519 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
11520 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
11521 (void) zio_nowait(wzio);
11522
11523 dev->l2ad_hand += asize;
11524 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
11525
11526 /*
11527 * Include the committed log block's pointer in the list of pointers
11528 * to log blocks present in the L2ARC device.
11529 */
11530 memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0],
11531 sizeof (l2arc_log_blkptr_t));
11532 mutex_enter(&dev->l2ad_mtx);
11533 list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
11534 ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
11535 ARCSTAT_BUMP(arcstat_l2_log_blk_count);
11536 zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
11537 zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
11538 mutex_exit(&dev->l2ad_mtx);
11539
11540 /* bump the kstats */
11541 ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
11542 ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
11543 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
11544 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
11545 dev->l2ad_log_blk_payload_asize / asize);
11546
11547 /* start a new log block */
11548 dev->l2ad_log_ent_idx = 0;
11549 dev->l2ad_log_blk_payload_asize = 0;
11550 dev->l2ad_log_blk_payload_start = 0;
11551
11552 return (asize);
11553 }
11554
11555 /*
11556 * Validates an L2ARC log block address to make sure that it can be read
11557 * from the provided L2ARC device.
11558 */
11559 boolean_t
l2arc_log_blkptr_valid(l2arc_dev_t * dev,const l2arc_log_blkptr_t * lbp)11560 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
11561 {
11562 /* L2BLK_GET_PSIZE returns aligned size for log blocks */
11563 uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
11564 uint64_t end = lbp->lbp_daddr + asize - 1;
11565 uint64_t start = lbp->lbp_payload_start;
11566 boolean_t evicted = B_FALSE;
11567
11568 /*
11569 * A log block is valid if all of the following conditions are true:
11570 * - it fits entirely (including its payload) between l2ad_start and
11571 * l2ad_end
11572 * - it has a valid size
11573 * - neither the log block itself nor part of its payload was evicted
11574 * by l2arc_evict():
11575 *
11576 * l2ad_hand l2ad_evict
11577 * | | lbp_daddr
11578 * | start | | end
11579 * | | | | |
11580 * V V V V V
11581 * l2ad_start ============================================ l2ad_end
11582 * --------------------------||||
11583 * ^ ^
11584 * | log block
11585 * payload
11586 */
11587
11588 evicted =
11589 l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
11590 l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
11591 l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
11592 l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
11593
11594 return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
11595 asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
11596 (!evicted || dev->l2ad_first));
11597 }
11598
11599 /*
11600 * Inserts ARC buffer header `hdr' into the current L2ARC log block on
11601 * the device. The buffer being inserted must be present in L2ARC.
11602 * Returns B_TRUE if the L2ARC log block is full and needs to be committed
11603 * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
11604 */
11605 static boolean_t
l2arc_log_blk_insert(l2arc_dev_t * dev,const arc_buf_hdr_t * hdr)11606 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
11607 {
11608 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
11609 l2arc_log_ent_phys_t *le;
11610
11611 if (dev->l2ad_log_entries == 0)
11612 return (B_FALSE);
11613
11614 int index = dev->l2ad_log_ent_idx++;
11615
11616 ASSERT3S(index, <, dev->l2ad_log_entries);
11617 ASSERT(HDR_HAS_L2HDR(hdr));
11618
11619 le = &lb->lb_entries[index];
11620 memset(le, 0, sizeof (*le));
11621 le->le_dva = hdr->b_dva;
11622 le->le_birth = hdr->b_birth;
11623 le->le_daddr = hdr->b_l2hdr.b_daddr;
11624 if (index == 0)
11625 dev->l2ad_log_blk_payload_start = le->le_daddr;
11626 L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
11627 L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
11628 L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
11629 le->le_complevel = hdr->b_complevel;
11630 L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
11631 L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
11632 L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
11633 L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
11634
11635 dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
11636 HDR_GET_PSIZE(hdr));
11637
11638 return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
11639 }
11640
11641 /*
11642 * Checks whether a given L2ARC device address sits in a time-sequential
11643 * range. The trick here is that the L2ARC is a rotary buffer, so we can't
11644 * just do a range comparison, we need to handle the situation in which the
11645 * range wraps around the end of the L2ARC device. Arguments:
11646 * bottom -- Lower end of the range to check (written to earlier).
11647 * top -- Upper end of the range to check (written to later).
11648 * check -- The address for which we want to determine if it sits in
11649 * between the top and bottom.
11650 *
11651 * The 3-way conditional below represents the following cases:
11652 *
11653 * bottom < top : Sequentially ordered case:
11654 * <check>--------+-------------------+
11655 * | (overlap here?) |
11656 * L2ARC dev V V
11657 * |---------------<bottom>============<top>--------------|
11658 *
11659 * bottom > top: Looped-around case:
11660 * <check>--------+------------------+
11661 * | (overlap here?) |
11662 * L2ARC dev V V
11663 * |===============<top>---------------<bottom>===========|
11664 * ^ ^
11665 * | (or here?) |
11666 * +---------------+---------<check>
11667 *
11668 * top == bottom : Just a single address comparison.
11669 */
11670 boolean_t
l2arc_range_check_overlap(uint64_t bottom,uint64_t top,uint64_t check)11671 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
11672 {
11673 if (bottom < top)
11674 return (bottom <= check && check <= top);
11675 else if (bottom > top)
11676 return (check <= top || bottom <= check);
11677 else
11678 return (check == top);
11679 }
11680
/*
 * Entry points exported for use by other kernel modules.
 */
EXPORT_SYMBOL(arc_buf_size);
EXPORT_SYMBOL(arc_write);
EXPORT_SYMBOL(arc_read);
EXPORT_SYMBOL(arc_buf_info);
EXPORT_SYMBOL(arc_getbuf_func);
EXPORT_SYMBOL(arc_add_prune_callback);
EXPORT_SYMBOL(arc_remove_prune_callback);

/*
 * Module tunables. The _CALL variants route writes through a setter so the
 * new value can be validated and applied to live state; the plain variants
 * expose the variable directly.
 */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
	spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
	spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW,
	"Balance between metadata and data on ghost hits.");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
	param_get_uint, ZMOD_RW, "Seconds before growing ARC size");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
	param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");

/* Only meaningful in-kernel, where a pagecache exists to balance against. */
#ifdef _KERNEL
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
	"Percent of pagecache to reclaim ARC to");
#endif

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD,
	"Target average block size");

ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
	"Disable compressed ARC buffers");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
	param_get_uint, ZMOD_RW, "Min life of prefetch block in ms");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
	param_set_arc_int, param_get_uint, ZMOD_RW,
	"Min life of prescient prefetched block in ms");

/*
 * L2ARC (secondary cache device) tunables.
 */
ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
	"Max write bytes per interval");

ZFS_MODULE_PARAM_CALL(zfs_l2arc, l2arc_, dwpd_limit, param_set_l2arc_dwpd_limit,
	spl_param_get_u64, ZMOD_RW,
	"L2ARC device endurance limit as percentage (100 = 1.0 DWPD)");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
	"Number of max device writes to precache");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW,
	"Compressed l2arc_headroom multiplier");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW,
	"TRIM ahead L2ARC write size multiplier");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW,
	"Seconds between L2ARC writing");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW,
	"Min feed interval in milliseconds");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
	"Skip caching prefetched buffers");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
	"Turbo L2ARC warmup");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
	"No reads during writes");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW,
	"Percent of ARC size allowed for L2ARC-only headers");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
	"Rebuild the L2ARC when importing a pool");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW,
	"Min size in bytes to write rebuild log blocks in L2ARC");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
	"Cache only MFU data from ARC into L2ARC");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
	"Exclude dbufs on special vdevs from being cached to L2ARC if set.");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_cycles, U64, ZMOD_RW,
	"Consecutive metadata cycles before skipping to let data run");

ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, ext_headroom_pct, U64, ZMOD_RW,
	"Depth cap as percentage of state size for marker reset");

/*
 * Memory-pressure and per-subsystem ARC limits.
 */
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
	param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64,
	spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64,
	spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC");

ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
	param_set_arc_int, param_get_uint, ZMOD_RW,
	"Percent of ARC meta buffers for dnodes");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW,
	"Percentage of excess dnodes to try to unpin");

/*
 * Eviction behavior and threading.
 */
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
	"When full, ARC allocation waits for eviction of this % of alloc size");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
	"The number of headers to evict per sublist before moving to the next");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batches_limit, UINT, ZMOD_RW,
	"The number of batches to run per parallel eviction task");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
	"Number of arc_prune threads");

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
	"Number of threads to use for ARC eviction.");