/* xref: /src/sys/contrib/openzfs/module/zfs/ddt_log.c */
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2023, Klara Inc.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/ddt.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dmu.h>
32 #include <sys/ddt_impl.h>
33 #include <sys/dnode.h>
34 #include <sys/dbuf.h>
35 #include <sys/zap.h>
36 #include <sys/zio_checksum.h>
37 
/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


/* kmem caches for in-memory log entries, one per phys variant. */
static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

/* Size of an in-memory log entry, including the trailing phys payload. */
#define	DDT_LOG_ENTRY_FLAT_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

/* Select the entry size matching this table's phys variant. */
#define	DDT_LOG_ENTRY_SIZE(ddt)	\
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)
61 
/*
 * Module init: create the kmem caches for in-memory log entries, and
 * resolve the memory-limit tunables into a concrete byte value.
 */
void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
83 
84 void
ddt_log_fini(void)85 ddt_log_fini(void)
86 {
87 	kmem_cache_destroy(ddt_log_entry_trad_cache);
88 	kmem_cache_destroy(ddt_log_entry_flat_cache);
89 }
90 
91 static void
ddt_log_name(ddt_t * ddt,char * name,uint_t n)92 ddt_log_name(ddt_t *ddt, char *name, uint_t n)
93 {
94 	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
95 	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
96 }
97 
/*
 * Persist the in-memory state of log ddl into the header kept in the
 * log object's bonus buffer. Called whenever ddl_flags, ddl_length,
 * ddl_first_txg or ddl_checkpoint change, so the state survives
 * export/import (see ddt_log_load_one()).
 */
static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	/* Version 1 is the only on-disk log format this code emits. */
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}
114 
/*
 * Allocate the on-disk object for one log (n is its index, 0 or 1),
 * record it in the DDT directory ZAP under its formatted name, and
 * write an initial (empty) header.
 */
static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT0(ddl->ddl_object);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	/* Data blocks hold log records; the bonus buffer holds the header. */
	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}
133 
134 static void
ddt_log_create(ddt_t * ddt,dmu_tx_t * tx)135 ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
136 {
137 	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
138 	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
139 }
140 
/*
 * Remove one log's directory ZAP entry and free its on-disk object.
 * A log that was never created (ddl_object == 0) is a no-op; a created
 * log must already be empty (ddl_length == 0).
 */
static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}
159 
160 void
ddt_log_destroy(ddt_t * ddt,dmu_tx_t * tx)161 ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
162 {
163 	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
164 	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
165 }
166 
167 static void
ddt_log_update_stats(ddt_t * ddt)168 ddt_log_update_stats(ddt_t *ddt)
169 {
170 	/*
171 	 * Log object stats. We count the number of live entries in the log
172 	 * tree, even if there are more than on disk, and even if the same
173 	 * entry is on both append and flush trees, because that's more what
174 	 * the user expects to see. This does mean the on-disk size is not
175 	 * really correlated with the number of entries, but I don't think
176 	 * that's reasonable to expect anyway.
177 	 */
178 	dmu_object_info_t doi;
179 	uint64_t nblocks = 0;
180 	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,
181 	    &doi) == 0)
182 		nblocks += doi.doi_physical_blocks_512;
183 	if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,
184 	    &doi) == 0)
185 		nblocks += doi.doi_physical_blocks_512;
186 
187 	ddt_object_t *ddo = &ddt->ddt_log_stats;
188 	ddo->ddo_count =
189 	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
190 	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
191 	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
192 	ddo->ddo_dspace = nblocks << 9;
193 }
194 
/*
 * Prepare to append up to nentries records to the active log in this
 * txg: create the log objects on first use, compute the fixed record
 * length for this table's phys variant, and hold enough buffers past
 * the current end of the log object to fit all the entries. State for
 * the following ddt_log_entry()/ddt_log_commit() calls is carried in
 * dlu.
 */
void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT0P(dlu->dlu_dbp);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	/* Whole blocks needed, at (datablksz / reclen) records per block. */
	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}
230 
231 static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t * ddt)232 ddt_log_alloc_entry(ddt_t *ddt)
233 {
234 	ddt_log_entry_t *ddle;
235 
236 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
237 		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
238 		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
239 	} else {
240 		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
241 		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
242 	}
243 
244 	return (ddle);
245 }
246 
247 static void
ddt_log_free_entry(ddt_t * ddt,ddt_log_entry_t * ddle)248 ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
249 {
250 	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
251 	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
252 }
253 
/*
 * Insert or overwrite ddlwe in the log tree ddl. When hist is set, the
 * log histogram is kept in sync: the previous version of the entry (if
 * any) is subtracted before the new one is added. The load path passes
 * hist=B_FALSE because the histogram is rebuilt once at the end of
 * ddt_log_load().
 */
static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe,
    boolean_t hist)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		/* No previous version of this key; insert a fresh node. */
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	} else if (hist) {
		/* Uncount the version we're about to overwrite. */
		ddt_lightweight_entry_t oddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &oddlwe);
		ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &oddlwe);
	}
	if (hist)
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}
277 
/*
 * Append one entry to the active log: update the in-memory tree and
 * histogram, then serialize the record into the next slot of the dbuf
 * array held by ddt_log_begin(). Records never straddle block
 * boundaries; if the current block lacks room, it is finished and the
 * record starts the next one.
 */
void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3P(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe, B_TRUE);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill_flags(db, dlu->dlu_tx, B_FALSE,
		    DMU_UNCACHEDIO);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}
327 
/*
 * Finish the append started by ddt_log_begin(): close out the last
 * block, release the held buffers and dnode, extend the recorded log
 * length by the whole blocks written, persist the updated header, and
 * clear dlu for reuse.
 */
void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3P(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	/* The log grows by whole blocks, even if the last is part-filled. */
	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}
353 
/*
 * Remove the smallest-keyed entry from log tree ddl, copying it into
 * ddlwe and uncounting it from the log histogram. Returns B_FALSE if
 * the tree is empty. Note this only changes the in-memory tree; the
 * on-disk record remains until the log is truncated.
 */
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	ddt_log_free_entry(ddt, ddle);

	return (B_TRUE);
}
370 
/*
 * Remove the entry matching ddk from log tree ddl, if present,
 * uncounting it from the log histogram. Returns B_TRUE if an entry was
 * removed. As with ddt_log_take_first(), only the in-memory tree is
 * changed; the on-disk record remains until the log is truncated.
 */
boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	ddt_lightweight_entry_t ddlwe;
	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	ddt_log_free_entry(ddt, ddle);

	return (B_TRUE);
}
387 
388 boolean_t
ddt_log_find_key(ddt_t * ddt,const ddt_key_t * ddk,ddt_lightweight_entry_t * ddlwe,boolean_t * from_flushing)389 ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
390     ddt_lightweight_entry_t *ddlwe, boolean_t *from_flushing)
391 {
392 	ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree,
393 	    ddk, NULL);
394 	if (ddle) {
395 		if (from_flushing)
396 			*from_flushing = B_FALSE;
397 	} else {
398 		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
399 		if (!ddle)
400 			return (B_FALSE);
401 		if (from_flushing)
402 			*from_flushing = B_TRUE;
403 	}
404 	if (ddlwe)
405 		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
406 	return (B_TRUE);
407 }
408 
/*
 * Record that everything in the flushing log up to and including
 * ddlwe's key has been flushed, by storing that key as the checkpoint
 * in the on-disk header. On reload, records at or before the
 * checkpoint are skipped (see ddt_log_load_entry()).
 */
void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}
433 
/*
 * Discard the entire on-disk contents of the flushing log and clear its
 * checkpoint. The in-memory tree must already be empty. A log with no
 * object yet is a no-op.
 */
void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}
454 
/*
 * Consider exchanging the active and flushing logs, so that the
 * accumulated active entries can begin flushing. The flushing tree must
 * already be drained. Returns B_TRUE if the swap was performed.
 */
boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is over
	 * half of the memory limit. This effectively gives each log tree half
	 * the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	/* Exchange the pointers and fix up the flags to match the new roles. */
	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}
517 
/*
 * Load one on-disk DLR_ENTRY record into the in-memory log tree.
 * Records at or before the checkpoint (when one is given) are skipped,
 * since they were already flushed before export. The histogram is not
 * updated here; ddt_log_load() rebuilds it afterwards.
 */
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)dlr->dlr_payload;
	if (checkpoint != NULL &&
	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
		/* Skip pre-checkpoint entries; they're already flushed. */
		return;
	}

	/* Reconstruct a lightweight entry from the serialized record. */
	ddt_lightweight_entry_t ddlwe;
	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

	ddlwe.ddlwe_key = dlre->dlre_key;
	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

	ddt_log_update_entry(ddt, ddl, &ddlwe, B_FALSE);
}
541 
542 static void
ddt_log_empty(ddt_t * ddt,ddt_log_t * ddl)543 ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
544 {
545 	void *cookie = NULL;
546 	ddt_log_entry_t *ddle;
547 	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
548 	while ((ddle =
549 	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
550 		ddt_log_free_entry(ddt, ddle);
551 	}
552 	ASSERT(avl_is_empty(&ddl->ddl_tree));
553 }
554 
/*
 * Load one log (index n) from disk: look up its object in the DDT
 * directory, read and validate the header from the bonus buffer, then
 * replay every record into the in-memory tree. Records at or before the
 * checkpoint (if any) are skipped. On success the ddl state is filled
 * in and the ddt active/flushing pointers are set from the header
 * flags. A missing log object is not an error (returns 0).
 */
static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
	ASSERT3U(n, <, 2);

	ddt_log_t *ddl = &ddt->ddt_log[n];

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	uint64_t obj;
	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &obj);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	dnode_t *dn;
	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
	if (err != 0)
		return (err);

	/* The header lives in the object's bonus buffer; copy it out. */
	ddt_log_header_t hdr;
	dmu_buf_t *db;
	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		return (err);
	}
	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
	dmu_buf_rele(db, FTAG);

	if (DLH_GET_VERSION(&hdr) != 1) {
		dnode_rele(dn, FTAG);
		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
		    (u_longlong_t)DLH_GET_VERSION(&hdr));
		return (SET_ERROR(EINVAL));
	}

	ddt_key_t *checkpoint = NULL;
	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
		/*
		 * If the log has a checkpoint, then we can ignore any entries
		 * that have already been flushed.
		 */
		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
		checkpoint = &hdr.dlh_checkpoint;
	}

	if (hdr.dlh_length > 0) {
		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
		    ZIO_PRIORITY_SYNC_READ);

		/* Walk the log block by block, replaying each record. */
		for (uint64_t offset = 0; offset < hdr.dlh_length;
		    offset += dn->dn_datablksz) {
			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
			    DMU_READ_PREFETCH | DMU_UNCACHEDIO);
			if (err != 0) {
				dnode_rele(dn, FTAG);
				ddt_log_empty(ddt, ddl);
				return (err);
			}

			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					/* Unknown record type; refuse load. */
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	/* The FLUSHING flag in the header tells us which role this log has. */
	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}
663 
/*
 * Load both logs from disk at pool import, then finalise the in-memory
 * state: drop flushing-tree entries that also appear on the active
 * tree, and rebuild the log histogram from the surviving entries.
 */
int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	/* Exactly one log must be active and one flushing after the load. */
	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries that
	 *   appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees. This
	 *   happens when ddt_lookup() rehydrates an entry from the flushing
	 *   tree, as ddt_log_take_key() removes the entry from the in-memory
	 *   tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	/* Merge-walk both trees in key order, deduplicating as we go. */
	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);
				ddt_log_free_entry(ddt, ddle);
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}
766 
767 void
ddt_log_alloc(ddt_t * ddt)768 ddt_log_alloc(ddt_t *ddt)
769 {
770 	ASSERT0P(ddt->ddt_log_active);
771 	ASSERT0P(ddt->ddt_log_flushing);
772 
773 	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
774 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
775 	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
776 	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
777 	ddt->ddt_log_active = &ddt->ddt_log[0];
778 	ddt->ddt_log_flushing = &ddt->ddt_log[1];
779 	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
780 }
781 
782 void
ddt_log_free(ddt_t * ddt)783 ddt_log_free(ddt_t *ddt)
784 {
785 	ddt_log_empty(ddt, &ddt->ddt_log[0]);
786 	ddt_log_empty(ddt, &ddt->ddt_log[1]);
787 	avl_destroy(&ddt->ddt_log[0].ddl_tree);
788 	avl_destroy(&ddt->ddt_log[1].ddl_tree);
789 }
790 
/* Expose the tunables declared at the top of this file as module parameters. */
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");
799