xref: /src/sys/contrib/openzfs/include/sys/ddt.h (revision 8a62a2a5659d1839d8799b4274c04469d7f17c78) !
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  * Copyright (c) 2023, Klara Inc.
26  */
27 
28 #ifndef _SYS_DDT_H
29 #define	_SYS_DDT_H
30 
31 #include <sys/sysmacros.h>
32 #include <sys/types.h>
33 #include <sys/fs/zfs.h>
34 #include <sys/zio.h>
35 #include <sys/dmu.h>
36 #include <sys/wmsum.h>
37 
38 #ifdef	__cplusplus
39 extern "C" {
40 #endif
41 
42 struct abd;
43 
/*
 * Feature flags that apply to a whole DDT. They are kept in the ddt_flags
 * field of ddt_t and are set by ddt_configure().
 */
#define	DDT_FLAG_FLAT	(1 << 0)	/* single extensible phys */
#define	DDT_FLAG_LOG	(1 << 1)	/* dedup log (journal) */
#define	DDT_FLAG_MASK	(DDT_FLAG_FLAT | DDT_FLAG_LOG)	/* all known flags */
50 
/*
 * On-disk storage object types for the DDT. Each type is backed by its own
 * implementation, see ddt_ops_t. The numeric value is not stored on disk.
 *
 * Lookups search the object types in the order they are declared here.
 *
 * DDT_TYPES doubles as the "no type" marker for a new entry that has not yet
 * been written to a storage object.
 */
typedef enum {
	DDT_TYPE_ZAP = 0,	/* ZAP storage object, ddt_zap */
	DDT_TYPES		/* count; also the "no type" value */
} ddt_type_t;

_Static_assert(DDT_TYPES <= UINT8_MAX, "ddt_type_t must fit in a uint8_t");
67 
/* Type given to new and updated entries, see ddt_sync_entry() */
#define	DDT_TYPE_DEFAULT	DDT_TYPE_ZAP
70 
/*
 * DDT storage classes. For each type there is a separate storage object per
 * class. The numeric value is not stored on disk.
 *
 * Lookups search the object classes in the order they are declared here.
 *
 * DDT_CLASSES doubles as the "no class" marker for a new entry that has not
 * yet been written to a storage object.
 */
typedef enum {
	DDT_CLASS_DITTO = 0,		/* entry has ditto blocks (obsolete) */
	DDT_CLASS_DUPLICATE = 1,	/* entry has multiple references */
	DDT_CLASS_UNIQUE = 2,		/* entry has a single reference */
	DDT_CLASSES			/* count; also the "no class" value */
} ddt_class_t;

_Static_assert(DDT_CLASSES < UINT8_MAX, "ddt_class_t must fit in a uint8_t");
89 
90 /*
91  * The "key" part of an on-disk entry. This is the unique "name" for a block,
92  * that is, that parts of the block pointer that will always be the same for
93  * the same data.
94  */
95 typedef struct {
96 	zio_cksum_t	ddk_cksum;	/* 256-bit block checksum */
97 	/*
98 	 * Encoded with logical & physical size, encryption, and compression,
99 	 * as follows:
100 	 *   +-------+-------+-------+-------+-------+-------+-------+-------+
101 	 *   |   0   |   0   |   0   |X| comp|     PSIZE     |     LSIZE     |
102 	 *   +-------+-------+-------+-------+-------+-------+-------+-------+
103 	 */
104 	uint64_t	ddk_prop;
105 } ddt_key_t;
106 
/*
 * Accessors for the fields packed into ddk_prop of a ddt_key_t. They are
 * modelled on the corresponding BP_* macros.
 */
#define	DDK_GET_LSIZE(ddk)		\
	BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define	DDK_SET_LSIZE(ddk, x)		\
	BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)

#define	DDK_GET_PSIZE(ddk)		\
	BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
#define	DDK_SET_PSIZE(ddk, x)		\
	BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)

#define	DDK_GET_COMPRESS(ddk)		\
	BF64_GET((ddk)->ddk_prop, 32, 7)
#define	DDK_SET_COMPRESS(ddk, x)	\
	BF64_SET((ddk)->ddk_prop, 32, 7, x)

#define	DDK_GET_CRYPT(ddk)		\
	BF64_GET((ddk)->ddk_prop, 39, 1)
#define	DDK_SET_CRYPT(ddk, x)		\
	BF64_SET((ddk)->ddk_prop, 39, 1, x)
126 
127 /*
128  * The "value" part for an on-disk entry. These are the "physical"
129  * characteristics of the stored block, such as its location on disk (DVAs),
130  * birth txg and ref count.
131  *
132  * The "traditional" entry has an array of four, one for each number of DVAs
133  * (copies= property) and another for additional "ditto" copies. Users of the
134  * traditional struct will specify the variant (index) of the one they want.
135  *
136  * The newer "flat" entry has only a single form that is specified using the
137  * DDT_PHYS_FLAT variant.
138  *
139  * Since the value size varies, use one of the size macros when interfacing
140  * with the ddt zap.
141  */
142 
143 #define	DDT_PHYS_MAX	(4)
144 
145 /*
146  * Note - this can be used in a flexible array and allocated for
147  * a specific size (ddp_trad or ddp_flat). So be careful not to
148  * copy using "=" assignment but instead use ddt_phys_copy().
149  */
150 typedef union {
151 	/*
152 	 * Traditional physical payload value for DDT zap (256 bytes)
153 	 */
154 	struct {
155 		dva_t		ddp_dva[SPA_DVAS_PER_BP];
156 		uint64_t	ddp_refcnt;
157 		uint64_t	ddp_phys_birth;
158 	} ddp_trad[DDT_PHYS_MAX];
159 
160 	/*
161 	 * Flat physical payload value for DDT zap (72 bytes)
162 	 */
163 	struct {
164 		dva_t		ddp_dva[SPA_DVAS_PER_BP];
165 		uint64_t	ddp_refcnt;
166 		uint64_t	ddp_phys_birth; /* txg based from BP */
167 		uint64_t	ddp_class_start; /* in realtime seconds */
168 	} ddp_flat;
169 } ddt_univ_phys_t;
170 
/*
 * Names the variant of a ddt_univ_phys_t that an operation targets. For a
 * traditional DDT entry the value is the index into the ddp_trad array.
 * Every consumer of a ddt_univ_phys_t needs to know which variant it is
 * working on.
 *
 * New DDT_PHYS_DITTO-type blocks are no longer generated, but the ability
 * to free existing dedup-ditto blocks is kept.
 */

typedef enum {
	DDT_PHYS_DITTO = 0,	/* ddp_trad[0] */
	DDT_PHYS_SINGLE = 1,	/* ddp_trad[1] */
	DDT_PHYS_DOUBLE = 2,	/* ddp_trad[2] */
	DDT_PHYS_TRIPLE = 3,	/* ddp_trad[3] */
	DDT_PHYS_FLAT = 4,	/* the single ddp_flat record */
	DDT_PHYS_NONE = 5	/* upper bound, see DDT_PHYS_VARIANT() */
} ddt_phys_variant_t;
189 
/*
 * Resolve the variant to use with ddt: DDT_PHYS_FLAT when DDT_FLAG_FLAT is
 * set in ddt_flags, otherwise p itself. Asserts that p is below
 * DDT_PHYS_NONE.
 */
#define	DDT_PHYS_VARIANT(ddt, p)				\
	(ASSERT((p) < DDT_PHYS_NONE),				\
	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? DDT_PHYS_FLAT : (p)))
193 
/*
 * Sizes of the two value layouts held in a ddt_univ_phys_t. The null-pointer
 * member access is only a sizeof operand, so it is never evaluated.
 */
#define	DDT_TRAD_PHYS_SIZE	(sizeof (((ddt_univ_phys_t *)0)->ddp_trad))
#define	DDT_FLAT_PHYS_SIZE	(sizeof (((ddt_univ_phys_t *)0)->ddp_flat))

/*
 * Evaluates to (flat) when ddt has DDT_FLAG_FLAT set in ddt_flags, and to
 * (trad) otherwise.
 */
#define	_DDT_PHYS_SWITCH(ddt, flat, trad)	\
	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

/* Size of the value layout used by ddt. */
#define	DDT_PHYS_SIZE(ddt)		_DDT_PHYS_SWITCH(ddt,	\
	DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

/* Records per value: 1 if flat, DDT_PHYS_MAX if traditional. */
#define	DDT_NPHYS(ddt)			_DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)

/* 0 for a flat DDT, otherwise p unchanged. */
#define	DDT_PHYS_FOR_COPIES(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, p)

/*
 * Non-zero only for a traditional DDT when p is 0 (DDT_PHYS_DITTO). p is
 * parenthesized so that an expression argument is compared as a whole
 * rather than being re-associated by operator precedence.
 */
#define	DDT_PHYS_IS_DITTO(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, ((p) == 0))
206 
/*
 * A "live" entry holds the changes made to an entry in this txg, plus other
 * data that supports loading, updating and repairing the entry.
 */

/* State bits kept in dde_flags */
#define	DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
#define	DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
#define	DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */
#define	DDE_FLAG_FROM_FLUSHING	(1 << 3)	/* loaded from flushing log */
217 
218 /*
219  * Additional data to support entry update or repair. This is fixed size
220  * because its relatively rarely used.
221  */
222 typedef struct {
223 	/* protects dde_phys, dde_orig_phys and dde_lead_zio during I/O */
224 	kmutex_t	dde_io_lock;
225 
226 	/* copy of data after a repair read, to be rewritten */
227 	abd_t		*dde_repair_abd;
228 
229 	/* original phys contents before update, for error handling */
230 	ddt_univ_phys_t	dde_orig_phys;
231 
232 	/* in-flight update IOs */
233 	zio_t		*dde_lead_zio[DDT_PHYS_MAX];
234 } ddt_entry_io_t;
235 
236 typedef struct {
237 	/* key must be first for ddt_key_compare */
238 	ddt_key_t	dde_key;	/* ddt_tree key */
239 	avl_node_t	dde_node;	/* ddt_tree_node */
240 
241 	/* storage type and class the entry was loaded from */
242 	ddt_type_t	dde_type;
243 	ddt_class_t	dde_class;
244 
245 	uint8_t		dde_flags;	/* load state flags */
246 	kcondvar_t	dde_cv;		/* signaled when load completes */
247 	uint64_t	dde_waiters;	/* count of waiters on dde_cv */
248 
249 	ddt_entry_io_t	*dde_io;	/* IO support, when required */
250 
251 	ddt_univ_phys_t	dde_phys[];	/* flexible -- allocated size varies */
252 } ddt_entry_t;
253 
254 /*
255  * A lightweight entry is for short-lived or transient uses, like iterating or
256  * inspecting, when you don't care where it came from.
257  */
258 typedef struct {
259 	ddt_key_t	ddlwe_key;
260 	ddt_type_t	ddlwe_type;
261 	ddt_class_t	ddlwe_class;
262 	ddt_univ_phys_t	ddlwe_phys;
263 } ddt_lightweight_entry_t;
264 
265 /*
266  * In-core DDT log. A separate struct to make it easier to switch between the
267  * appending and flushing logs.
268  */
269 typedef struct {
270 	avl_tree_t	ddl_tree;	/* logged entries */
271 	uint32_t	ddl_flags;	/* flags for this log */
272 	uint64_t	ddl_object;	/* log object id */
273 	uint64_t	ddl_length;	/* on-disk log size */
274 	uint64_t	ddl_first_txg;	/* txg log became active */
275 	ddt_key_t	ddl_checkpoint;	/* last checkpoint */
276 } ddt_log_t;
277 
278 /*
279  * In-core DDT object. This covers all entries and stats for a the whole pool
280  * for a given checksum type.
281  */
282 typedef struct {
283 	kmutex_t	ddt_lock;	/* protects changes to all fields */
284 	avl_tree_t	ddt_tree;	/* "live" (changed) entries this txg */
285 	avl_tree_t	ddt_repair_tree;	/* entries being repaired */
286 
287 	/* Protects ddt_object[] and ddt_object_dnode[]. */
288 	krwlock_t	ddt_objects_lock ____cacheline_aligned;
289 
290 	/*
291 	 * Log trees are stable during I/O, and only modified during sync
292 	 * with exclusive access.
293 	 */
294 	ddt_log_t	ddt_log[2] ____cacheline_aligned; /* logged entries */
295 	ddt_log_t	*ddt_log_active;	/* pointers into ddt_log */
296 	ddt_log_t	*ddt_log_flushing;	/* swapped when flush starts */
297 
298 	int32_t		ddt_log_ingest_rate;	/* rolling log ingest rate */
299 	int32_t		ddt_log_flush_rate;	/* rolling log flush rate */
300 	int32_t		ddt_log_flush_time_rate; /* avg time spent flushing */
301 	uint32_t	ddt_log_flush_pressure;	/* pressure to apply for cap */
302 	uint32_t	ddt_log_flush_prev_backlog; /* prev backlog size */
303 
304 	uint64_t	ddt_flush_force_txg;	/* flush hard before this txg */
305 
306 	kstat_t		*ddt_ksp;	/* kstats context */
307 
308 	/* wmsums for hot-path lookup counters */
309 	wmsum_t		ddt_kstat_dds_lookup;
310 	wmsum_t		ddt_kstat_dds_lookup_live_hit;
311 	wmsum_t		ddt_kstat_dds_lookup_live_wait;
312 	wmsum_t		ddt_kstat_dds_lookup_live_miss;
313 	wmsum_t		ddt_kstat_dds_lookup_existing;
314 	wmsum_t		ddt_kstat_dds_lookup_new;
315 	wmsum_t		ddt_kstat_dds_lookup_log_hit;
316 	wmsum_t		ddt_kstat_dds_lookup_log_active_hit;
317 	wmsum_t		ddt_kstat_dds_lookup_log_flushing_hit;
318 	wmsum_t		ddt_kstat_dds_lookup_log_miss;
319 	wmsum_t		ddt_kstat_dds_lookup_stored_hit;
320 	wmsum_t		ddt_kstat_dds_lookup_stored_miss;
321 
322 	enum zio_checksum ddt_checksum;	/* checksum algorithm in use */
323 	spa_t		*ddt_spa;	/* pool this ddt is on */
324 	objset_t	*ddt_os;	/* ddt objset (always MOS) */
325 
326 	uint64_t	ddt_dir_object;	/* MOS dir holding ddt objects */
327 	uint64_t	ddt_version;	/* DDT version */
328 	uint64_t	ddt_flags;	/* FDT option flags */
329 
330 	/* per-type/per-class entry store objects */
331 	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];
332 	dnode_t		*ddt_object_dnode[DDT_TYPES][DDT_CLASSES];
333 
334 	/* object ids for stored, logged and per-type/per-class stats */
335 	uint64_t	ddt_stat_object;
336 	ddt_object_t	ddt_log_stats;
337 	ddt_object_t	ddt_object_stats[DDT_TYPES][DDT_CLASSES];
338 
339 	/* type/class stats by power-2-sized referenced blocks */
340 	ddt_histogram_t	ddt_histogram[DDT_TYPES][DDT_CLASSES];
341 	ddt_histogram_t	ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
342 
343 	/* log stats power-2-sized referenced blocks */
344 	ddt_histogram_t	ddt_log_histogram;
345 } ddt_t;
346 
/*
 * Bookmark for DDT walks, used both in core and on disk. It is the cursor
 * for ddt_walk() and stays stable across calls, even if the DDT is updated,
 * the pool is restarted or loaded on another system, or OpenZFS is upgraded.
 */
typedef struct {
	uint64_t	ddb_class;	/* walk position: class */
	uint64_t	ddb_type;	/* walk position: type */
	uint64_t	ddb_checksum;	/* walk position: checksum */
	uint64_t	ddb_cursor;	/* walk position: cursor */
} ddt_bookmark_t;
358 
359 extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
360     blkptr_t *bp, uint64_t txg);
361 extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
362     const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);
363 
364 extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
365     const blkptr_t *bp);
366 extern void ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig,
367     ddt_phys_variant_t v);
368 extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
369     ddt_phys_variant_t v);
370 extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
371 extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
372 extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
373 extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
374     ddt_phys_variant_t v);
375 extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
376     const ddt_entry_t *dde, const blkptr_t *bp);
377 extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
378     ddt_phys_variant_t v);
379 extern int ddt_phys_is_gang(const ddt_univ_phys_t *ddp,
380     ddt_phys_variant_t v);
381 extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
382     boolean_t encrypted);
383 
384 extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh,
385     const ddt_lightweight_entry_t *ddlwe);
386 extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh,
387     const ddt_lightweight_entry_t *ddlwe);
388 
389 extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
390 extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh);
391 extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
392 
393 extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
394 extern uint64_t ddt_get_ddt_dsize(spa_t *spa);
395 extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
396 extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
397 
398 extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
399 extern uint64_t ddt_get_dedup_used(spa_t *spa);
400 extern uint64_t ddt_get_dedup_saved(spa_t *spa);
401 extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
402 extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize);
403 
404 extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
405 extern void ddt_enter(ddt_t *ddt);
406 extern void ddt_exit(ddt_t *ddt);
407 extern void ddt_init(void);
408 extern void ddt_fini(void);
409 extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp,
410     boolean_t verify);
411 extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
412 extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
413 extern void ddt_prefetch_all(spa_t *spa);
414 
415 extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
416     const blkptr_t *bp);
417 
418 extern void ddt_alloc_entry_io(ddt_entry_t *dde);
419 
420 extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
421 extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
422 
423 extern int ddt_key_compare(const void *x1, const void *x2);
424 
425 extern void ddt_create(spa_t *spa);
426 extern int ddt_load(spa_t *spa);
427 extern void ddt_unload(spa_t *spa);
428 extern void ddt_sync(spa_t *spa, uint64_t txg);
429 
430 extern void ddt_walk_init(spa_t *spa, uint64_t txg);
431 extern boolean_t ddt_walk_ready(spa_t *spa);
432 extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
433     ddt_lightweight_entry_t *ddlwe);
434 
435 extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
436 
437 extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
438     uint64_t amount);
439 
440 #ifdef	__cplusplus
441 }
442 #endif
443 
444 #endif	/* _SYS_DDT_H */
445