1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * This file and its contents are supplied under the terms of the 6 * Common Development and Distribution License ("CDDL"), version 1.0. 7 * You may only use this file in accordance with the terms of version 8 * 1.0 of the CDDL. 9 * 10 * A full copy of the text of the CDDL should have accompanied this 11 * source. A copy of the CDDL is also available via the Internet at 12 * http://www.illumos.org/license/CDDL. 13 * 14 * CDDL HEADER END 15 */ 16 /* 17 * Copyright (c) 2019 by Delphix. All rights reserved. 18 */ 19 20 #ifndef _BTREE_H 21 #define _BTREE_H 22 23 #ifdef __cplusplus 24 extern "C" { 25 #endif 26 27 #include <sys/zfs_context.h> 28 29 /* 30 * This file defines the interface for a B-Tree implementation for ZFS. The 31 * tree can be used to store arbitrary sortable data types with low overhead 32 * and good operation performance. In addition the tree intelligently 33 * optimizes bulk in-order insertions to improve memory use and performance. 34 * 35 * Note that for all B-Tree functions, the values returned are pointers to the 36 * internal copies of the data in the tree. The internal data can only be 37 * safely mutated if the changes cannot change the ordering of the element 38 * with respect to any other elements in the tree. 39 * 40 * The major drawback of the B-Tree is that any returned elements or indexes 41 * are only valid until a side-effectful operation occurs, since these can 42 * result in reallocation or relocation of data. Side effectful operations are 43 * defined as insertion, removal, and zfs_btree_destroy_nodes. 44 * 45 * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core 46 * nodes have an array of children pointing to other nodes, and an array of 47 * elements that act as separators between the elements of the subtrees rooted 48 * at its children. Leaf nodes only contain data elements, and form the bottom 49 * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the 50 * elements in the core nodes are not copies of or references to leaf node 51 * elements. Each element occurs only once in the tree, no matter what kind 52 * of node it is in. 53 * 54 * The tree's height is the same throughout, unlike many other forms of search 55 * tree. Each node (except for the root) must be between half minus one and 56 * completely full of elements (and children) at all times. Any operation that 57 * would put the node outside of that range results in a rebalancing operation 58 * (taking, merging, or splitting). 59 * 60 * This tree was implemented using descriptions from Wikipedia's articles on 61 * B-Trees and B+ Trees. 62 */ 63 64 /* 65 * Decreasing these values results in smaller memmove operations, but more of 66 * them, and increased memory overhead. Increasing these values results in 67 * higher variance in operation time, and reduces memory overhead. 68 */ 69 #define BTREE_CORE_ELEMS 126 70 #define BTREE_LEAF_SIZE 4096 71 72 extern kmem_cache_t *zfs_btree_leaf_cache; 73 74 typedef struct zfs_btree_hdr { 75 struct zfs_btree_core *bth_parent; 76 /* 77 * Set to -1 to indicate core nodes. Other values represent first 78 * valid element offset for leaf nodes. 79 */ 80 uint32_t bth_first; 81 /* 82 * For both leaf and core nodes, represents the number of elements in 83 * the node. For core nodes, they will have bth_count + 1 children. 84 */ 85 uint32_t bth_count; 86 } zfs_btree_hdr_t; 87 88 typedef struct zfs_btree_core { 89 zfs_btree_hdr_t btc_hdr; 90 zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; 91 uint8_t btc_elems[]; 92 } zfs_btree_core_t; 93 94 typedef struct zfs_btree_leaf { 95 zfs_btree_hdr_t btl_hdr; 96 uint8_t btl_elems[]; 97 } zfs_btree_leaf_t; 98 99 typedef struct zfs_btree_index { 100 zfs_btree_hdr_t *bti_node; 101 uint32_t bti_offset; 102 /* 103 * True if the location is before the list offset, false if it's at 104 * the listed offset. 105 */ 106 boolean_t bti_before; 107 } zfs_btree_index_t; 108 109 typedef struct btree zfs_btree_t; 110 typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t, 111 const void *, zfs_btree_index_t *); 112 113 struct btree { 114 int (*bt_compar) (const void *, const void *); 115 bt_find_in_buf_f bt_find_in_buf; 116 size_t bt_elem_size; 117 size_t bt_leaf_size; 118 uint32_t bt_leaf_cap; 119 int32_t bt_height; 120 uint64_t bt_num_elems; 121 uint64_t bt_num_nodes; 122 zfs_btree_hdr_t *bt_root; 123 zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading 124 }; 125 126 /* 127 * Implementation of Shar's algorithm designed to accelerate binary search by 128 * eliminating impossible to predict branches. 129 * 130 * For optimality, this should be used to generate the search function in the 131 * same file as the comparator and the comparator should be marked 132 * `__attribute__((always_inline) inline` so that the compiler will inline it. 133 * 134 * Arguments are: 135 * 136 * NAME - The function name for this instance of the search function. Use it 137 * in a subsequent call to zfs_btree_create(). 138 * T - The element type stored inside the B-Tree. 139 * COMP - A comparator to compare two nodes, it must return exactly: -1, 0, 140 * or +1 -1 for <, 0 for ==, and +1 for >. For trivial comparisons, 141 * TREE_CMP() from avl.h can be used in a boilerplate function. 142 */ 143 /* BEGIN CSTYLED */ 144 #define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \ 145 _Pragma("GCC diagnostic push") \ 146 _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ 147 static void * \ 148 NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \ 149 const void *value, zfs_btree_index_t *where) \ 150 { \ 151 T *i = (T *)buf; \ 152 (void) tree; \ 153 _Pragma("GCC unroll 9") \ 154 while (nelems > 1) { \ 155 uint32_t half = nelems / 2; \ 156 nelems -= half; \ 157 i += (COMP(&i[half - 1], value) < 0) * half; \ 158 } \ 159 \ 160 int comp = COMP(i, value); \ 161 where->bti_offset = (i - (T *)buf) + (comp < 0); \ 162 where->bti_before = (comp != 0); \ 163 \ 164 if (comp == 0) { \ 165 return (i); \ 166 } \ 167 \ 168 return (NULL); \ 169 } \ 170 _Pragma("GCC diagnostic pop") 171 /* END CSTYLED */ 172 173 /* 174 * Allocate and deallocate caches for btree nodes. 175 */ 176 void zfs_btree_init(void); 177 void zfs_btree_fini(void); 178 179 /* 180 * Initialize an B-Tree. Arguments are: 181 * 182 * tree - the tree to be initialized 183 * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 184 * -1 for <, 0 for ==, and +1 for > 185 * find - optional function to accelerate searches inside B-Tree nodes 186 * through Shar's algorithm and comparator inlining. Setting this to 187 * NULL will use a generic function. The function should be created 188 * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar. 189 * compar should be marked `__attribute__((always_inline)) inline` or 190 * performance is unlikely to improve very much. 191 * size - the value of sizeof(struct my_type) 192 * lsize - custom leaf size 193 */ 194 void zfs_btree_create(zfs_btree_t *, 195 int (*) (const void *, const void *), 196 bt_find_in_buf_f, size_t); 197 void zfs_btree_create_custom(zfs_btree_t *, 198 int (*)(const void *, const void *), 199 bt_find_in_buf_f, size_t, size_t); 200 201 /* 202 * Find a node with a matching value in the tree. Returns the matching node 203 * found. If not found, it returns NULL and then if "where" is not NULL it sets 204 * "where" for use with zfs_btree_add_idx() or zfs_btree_nearest(). 205 * 206 * node - node that has the value being looked for 207 * where - position for use with zfs_btree_nearest() or zfs_btree_add_idx(), 208 * may be NULL 209 */ 210 void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); 211 212 /* 213 * Insert a node into the tree. 214 * 215 * node - the node to insert 216 * where - position as returned from zfs_btree_find() 217 */ 218 void zfs_btree_add_idx(zfs_btree_t *, const void *, 219 const zfs_btree_index_t *); 220 221 /* 222 * Return the first or last valued node in the tree. Will return NULL if the 223 * tree is empty. The index can be NULL if the location of the first or last 224 * element isn't required. 225 */ 226 void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *); 227 void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); 228 229 /* 230 * Return the next or previous valued node in the tree. The second index can 231 * safely be NULL, if the location of the next or previous value isn't 232 * required. 233 */ 234 void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, 235 zfs_btree_index_t *); 236 void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, 237 zfs_btree_index_t *); 238 239 /* 240 * Get a value from a tree and an index. 241 */ 242 void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *); 243 244 /* 245 * Add a single value to the tree. The value must not compare equal to any 246 * other node already in the tree. Note that the value will be copied out, not 247 * inserted directly. It is safe to free or destroy the value once this 248 * function returns. 249 */ 250 void zfs_btree_add(zfs_btree_t *, const void *); 251 252 /* 253 * Remove a single value from the tree. The value must be in the tree. The 254 * pointer passed in may be a pointer into a tree-controlled buffer, but it 255 * need not be. 256 */ 257 void zfs_btree_remove(zfs_btree_t *, const void *); 258 259 /* 260 * Remove the value at the given location from the tree. 261 */ 262 void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *); 263 264 /* 265 * Return the number of nodes in the tree 266 */ 267 ulong_t zfs_btree_numnodes(zfs_btree_t *); 268 269 /* 270 * Used to destroy any remaining nodes in a tree. The cookie argument should 271 * be initialized to NULL before the first call. Returns a node that has been 272 * removed from the tree and may be free()'d. Returns NULL when the tree is 273 * empty. 274 * 275 * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it 276 * and finally zfs_btree_destroy(). No other B-Tree routines will be valid. 277 * 278 * cookie - an index used to save state between calls to 279 * zfs_btree_destroy_nodes() 280 * 281 * EXAMPLE: 282 * zfs_btree_t *tree; 283 * struct my_data *node; 284 * zfs_btree_index_t *cookie; 285 * 286 * cookie = NULL; 287 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) 288 * data_destroy(node); 289 * zfs_btree_destroy(tree); 290 */ 291 void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **); 292 293 /* 294 * Destroys all nodes in the tree quickly. This doesn't give the caller an 295 * opportunity to iterate over each node and do its own cleanup; for that, use 296 * zfs_btree_destroy_nodes(). 297 */ 298 void zfs_btree_clear(zfs_btree_t *); 299 300 /* 301 * Final destroy of an B-Tree. Arguments are: 302 * 303 * tree - the empty tree to destroy 304 */ 305 void zfs_btree_destroy(zfs_btree_t *tree); 306 307 /* Runs a variety of self-checks on the btree to verify integrity. */ 308 void zfs_btree_verify(zfs_btree_t *tree); 309 310 #ifdef __cplusplus 311 } 312 #endif 313 314 #endif /* _BTREE_H */ 315