xref: /src/sys/contrib/openzfs/include/sys/btree.h (revision 8a62a2a5659d1839d8799b4274c04469d7f17c78)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * This file and its contents are supplied under the terms of the
6  * Common Development and Distribution License ("CDDL"), version 1.0.
7  * You may only use this file in accordance with the terms of version
8  * 1.0 of the CDDL.
9  *
10  * A full copy of the text of the CDDL should have accompanied this
11  * source.  A copy of the CDDL is also available via the Internet at
12  * http://www.illumos.org/license/CDDL.
13  *
14  * CDDL HEADER END
15  */
16 /*
17  * Copyright (c) 2019 by Delphix. All rights reserved.
18  */
19 
20 #ifndef	_BTREE_H
21 #define	_BTREE_H
22 
23 #ifdef	__cplusplus
24 extern "C" {
25 #endif
26 
27 #include	<sys/zfs_context.h>
28 
29 /*
30  * This file defines the interface for a B-Tree implementation for ZFS. The
31  * tree can be used to store arbitrary sortable data types with low overhead
32  * and good operation performance. In addition the tree intelligently
33  * optimizes bulk in-order insertions to improve memory use and performance.
34  *
35  * Note that for all B-Tree functions, the values returned are pointers to the
36  * internal copies of the data in the tree. The internal data can only be
37  * safely mutated if the changes cannot change the ordering of the element
38  * with respect to any other elements in the tree.
39  *
40  * The major drawback of the B-Tree is that any returned elements or indexes
41  * are only valid until a side-effectful operation occurs, since these can
42  * result in reallocation or relocation of data. Side-effectful operations are
43  * defined as insertion, removal, and zfs_btree_destroy_nodes.
44  *
45  * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
46  * nodes have an array of children pointing to other nodes, and an array of
47  * elements that act as separators between the elements of the subtrees rooted
48  * at its children. Leaf nodes only contain data elements, and form the bottom
49  * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
50  * elements in the core nodes are not copies of or references to leaf node
51  * elements.  Each element occurs only once in the tree, no matter what kind
52  * of node it is in.
53  *
54  * The tree's height is the same throughout, unlike many other forms of search
55  * tree. Each node (except for the root) must be between half minus one and
56  * completely full of elements (and children) at all times. Any operation that
57  * would put the node outside of that range results in a rebalancing operation
58  * (taking, merging, or splitting).
59  *
60  * This tree was implemented using descriptions from Wikipedia's articles on
61  * B-Trees and B+ Trees.
62  */
63 
64 /*
65  * Decreasing these values results in smaller memmove operations, but more of
66  * them, and increased memory overhead. Increasing these values results in
67  * higher variance in operation time, and reduces memory overhead.
68  */
69 #define	BTREE_CORE_ELEMS	126	/* sizes btc_children[] (this + 1 entries) */
70 #define	BTREE_LEAF_SIZE		4096	/* presumably the default leaf size in bytes - confirm in btree.c */
71 
72 extern kmem_cache_t *zfs_btree_leaf_cache;	/* declaration only; defined outside this header */
73 
74 typedef struct zfs_btree_hdr {	/* header embedded first in core and leaf nodes */
75 	struct zfs_btree_core	*bth_parent;	/* parent core node */
76 	/*
77 	 * Set to -1 (UINT32_MAX, as the field is unsigned) to indicate core
78 	 * nodes. Other values are the first valid element offset in a leaf.
79 	 */
80 	uint32_t		bth_first;
81 	/*
82 	 * For both leaf and core nodes, represents the number of elements in
83 	 * the node. For core nodes, they will have bth_count + 1 children.
84 	 */
85 	uint32_t		bth_count;
86 } zfs_btree_hdr_t;
87 
88 typedef struct zfs_btree_core {	/* core node: separator elements plus child pointers */
89 	zfs_btree_hdr_t	btc_hdr;	/* common node header */
90 	zfs_btree_hdr_t	*btc_children[BTREE_CORE_ELEMS + 1];	/* bth_count + 1 are in use */
91 	uint8_t		btc_elems[];	/* flexible array of raw element bytes */
92 } zfs_btree_core_t;
93 
94 typedef struct zfs_btree_leaf {	/* leaf node: data elements only, no children */
95 	zfs_btree_hdr_t	btl_hdr;	/* common node header */
96 	uint8_t		btl_elems[];	/* flexible array of raw element bytes */
97 } zfs_btree_leaf_t;
98 
99 typedef struct zfs_btree_index {	/* a location in the tree, as used by the find/add/next/prev routines */
100 	zfs_btree_hdr_t	*bti_node;	/* node the location refers to */
101 	uint32_t	bti_offset;	/* element offset within that node */
102 	/*
103 	 * True if the location is before the listed offset, false if it's at
104 	 * the listed offset.
105 	 */
106 	boolean_t	bti_before;
107 } zfs_btree_index_t;
108 
109 typedef struct btree zfs_btree_t;
110 typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
111     const void *, zfs_btree_index_t *);	/* (tree, buf, nelems, value, where) */
112
113 struct btree {
114 	int (*bt_compar) (const void *, const void *);	/* element comparator: -1, 0 or +1 */
115 	bt_find_in_buf_f	bt_find_in_buf;	/* searches one node's element buffer */
116 	size_t			bt_elem_size;	/* presumably the "size" passed at creation - confirm */
117 	size_t			bt_leaf_size;	/* presumably the leaf size ("lsize") in bytes - confirm */
118 	uint32_t		bt_leaf_cap;	/* presumably elements per leaf - confirm in btree.c */
119 	int32_t			bt_height;
120 	uint64_t		bt_num_elems;
121 	uint64_t		bt_num_nodes;
122 	zfs_btree_hdr_t		*bt_root;	/* root node; may be a core or a leaf (both start with the header) */
123 	zfs_btree_leaf_t	*bt_bulk; // non-null if bulk loading
124 };
125 
126 /*
127  * Implementation of Shar's algorithm designed to accelerate binary search by
128  * eliminating impossible to predict branches. The generated function returns
129  * the matching element (or NULL) and always fills in where->bti_offset and
130  * where->bti_before. It compares buf[0] even when the loop is skipped, so
131  * nelems must be at least 1. For optimality, generate it in the same file as
132  * the comparator, marked `__attribute__((always_inline)) inline`.
133  *
134  * Arguments are:
135  *
136  * NAME   - The function name for this instance of the search function. Use it
137  *          in a subsequent call to zfs_btree_create().
138  * T      - The element type stored inside the B-Tree.
139  * COMP   - A comparator to compare two nodes, it must return exactly: -1, 0,
140  *          or +1; -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
141  *          TREE_CMP() from avl.h can be used in a boilerplate function.
142  */
143 /* BEGIN CSTYLED */
144 #define	ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP)			\
145 _Pragma("GCC diagnostic push")						\
146 _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")			\
147 static void *								\
148 NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,			\
149     const void *value, zfs_btree_index_t *where)			\
150 {									\
151 	T *i = (T *)buf;						\
152 	(void) tree;							\
153 	_Pragma("GCC unroll 9")						\
154 	while (nelems > 1) {						\
155 		uint32_t half = nelems / 2;				\
156 		nelems -= half;						\
157 		i += (COMP(&i[half - 1], value) < 0) * half;		\
158 	}								\
159 									\
160 	int comp = COMP(i, value);					\
161 	where->bti_offset = (i - (T *)buf) + (comp < 0);		\
162 	where->bti_before = (comp != 0);				\
163 									\
164 	if (comp == 0) {						\
165 		return (i);						\
166 	}								\
167 									\
168 	return (NULL);							\
169 }									\
170 _Pragma("GCC diagnostic pop")
171 /* END CSTYLED */
172 
173 /*
174  * Setup and teardown: allocate and deallocate the caches for btree nodes.
175  */
176 void zfs_btree_init(void);
177 void zfs_btree_fini(void);
178 
179 /*
180  * Initialize a B-Tree. Arguments are:
181  *
182  * tree   - the tree to be initialized
183  * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
184  *          -1 for <, 0 for ==, and +1 for >
185  * find   - optional function to accelerate searches inside B-Tree nodes
186  *          through Shar's algorithm and comparator inlining. Setting this to
187  *          NULL will use a generic function. The function should be created
188  *          using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
189  *          compar should be marked `__attribute__((always_inline)) inline` or
190  *          performance is unlikely to improve very much.
191  * size   - the value of sizeof(struct my_type)
192  * lsize  - custom leaf size (zfs_btree_create_custom() only)
193  */
194 void zfs_btree_create(zfs_btree_t *,
195     int (*) (const void *, const void *),
196     bt_find_in_buf_f, size_t);
197 void zfs_btree_create_custom(zfs_btree_t *,
198     int (*)(const void *, const void *),
199     bt_find_in_buf_f, size_t, size_t);
200 
201 /*
202  * Find a node with a matching value in the tree. Returns a pointer to the
203  * tree's internal copy of it. If not found, it returns NULL and then if
204  * "where" is not NULL it sets "where" for zfs_btree_add_idx() or zfs_btree_nearest().
205  *
206  * node   - node that has the value being looked for
207  * where  - position for use with zfs_btree_nearest() or zfs_btree_add_idx(),
208  *          may be NULL
209  */
210 void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);
211 
212 /*
213  * Insert a node into the tree; as an insertion, it invalidates earlier elements and indexes.
214  *
215  * node   - the node to insert
216  * where  - position as returned from zfs_btree_find()
217  */
218 void zfs_btree_add_idx(zfs_btree_t *, const void *,
219     const zfs_btree_index_t *);
220 
221 /*
222  * Return the first or last valued node in the tree (a pointer to the tree's
223  * internal copy), or NULL if the tree is empty. The index can be NULL if the
224  * location of the first or last element isn't required.
225  */
226 void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
227 void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);
228 
229 /*
230  * Return the next or previous valued node in the tree, relative to the first
231  * index. The second index receives the location of that value and can safely
232  * be NULL if the location isn't required.
233  */
234 void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
235     zfs_btree_index_t *);
236 void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
237     zfs_btree_index_t *);
238 
239 /*
240  * Get a value from a tree and an index; the result points at the tree's internal copy.
241  */
242 void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);
243 
244 /*
245  * Add a single value to the tree. The value must not compare equal to any
246  * other node already in the tree. Note that the value is copied into the
247  * tree, not inserted directly. It is safe to free or destroy the value once
248  * this function returns.
249  */
250 void zfs_btree_add(zfs_btree_t *, const void *);
251 
252 /*
253  * Remove a single value from the tree.  The value must be in the tree. The
254  * pointer passed in may point into a tree-controlled buffer, but need not.
255  * As a removal, this invalidates previously returned elements and indexes.
256  */
257 void zfs_btree_remove(zfs_btree_t *, const void *);
258 
259 /*
260  * Remove the value at the given location from the tree; other elements and indexes become invalid.
261  */
262 void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *);
263 
264 /*
265  * Return the number of nodes in the tree.
266  */
267 ulong_t zfs_btree_numnodes(zfs_btree_t *);	/* NOTE(review): stored elements or B-Tree nodes? confirm in btree.c */
268 
269 /*
270  * Used to destroy any remaining nodes in a tree. The cookie argument should
271  * be initialized to NULL before the first call. Returns a node that has been
272  * removed from the tree and may be free()'d. Returns NULL when the tree is
273  * empty.
274  *
275  * Once you call zfs_btree_destroy_nodes(), you can only continue calling it
276  * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
277  *
278  * cookie - an index used to save state between calls to
279  * zfs_btree_destroy_nodes()
280  *
281  * EXAMPLE:
282  *	zfs_btree_t *tree;
283  *	struct my_data *node;
284  *	zfs_btree_index_t *cookie;
285  *
286  *	cookie = NULL;
287  *	while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
288  *		data_destroy(node);
289  *	zfs_btree_destroy(tree);
290  */
291 void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);
292 
293 /*
294  * Destroys all nodes in the tree quickly. This doesn't give the caller an
295  * opportunity to iterate over each node and do its own cleanup; for that, use
296  * zfs_btree_destroy_nodes() in a loop instead.
297  */
298 void zfs_btree_clear(zfs_btree_t *);
299 
300 /*
301  * Final destroy of a B-Tree. Arguments are:
302  *
303  * tree   - the empty tree to destroy
304  */
305 void zfs_btree_destroy(zfs_btree_t *tree);
306 
307 /* Runs a variety of self-checks on the btree to verify integrity; it has no return value. */
308 void zfs_btree_verify(zfs_btree_t *tree);
309 
310 #ifdef	__cplusplus
311 }
312 #endif
313 
314 #endif	/* _BTREE_H */
315