1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * This file and its contents are supplied under the terms of the
6 * Common Development and Distribution License ("CDDL"), version 1.0.
7 * You may only use this file in accordance with the terms of version
8 * 1.0 of the CDDL.
9 *
10 * A full copy of the text of the CDDL should have accompanied this
11 * source. A copy of the CDDL is also available via the Internet at
12 * http://www.illumos.org/license/CDDL.
13 *
14 * CDDL HEADER END
15 */
16 /*
17 * Copyright (c) 2019 by Delphix. All rights reserved.
18 */
19
20 #include <sys/btree.h>
21 #include <sys/bitops.h>
22 #include <sys/zfs_context.h>
23
24 #ifndef _KERNEL
25 #define panic(...) PANIC(__VA_ARGS__)
26 #endif
27
28 kmem_cache_t *zfs_btree_leaf_cache = NULL;
29
30 /*
31 * Control the extent of the verification that occurs when zfs_btree_verify is
32 * called. Primarily used for debugging when extending the btree logic and
33 * functionality. As the intensity is increased, new verification steps are
34 * added. These steps are cumulative; intensity = 3 includes the intensity = 1
35 * and intensity = 2 steps as well.
36 *
37 * Intensity 1: Verify that the tree's height is consistent throughout.
38 * Intensity 2: Verify that a core node's children's parent pointers point
39 * to the core node.
40 * Intensity 3: Verify that the total number of elements in the tree matches the
41 * sum of the number of elements in each node. Also verifies that each node's
42 * count obeys the invariants (less than or equal to maximum value, greater than
43 * or equal to half the maximum minus one).
44 * Intensity 4: Verify that each element compares less than the element
45 * immediately after it and greater than the one immediately before it using the
46 * comparator function. For core nodes, also checks that each element is greater
47 * than the last element in the first of the two nodes it separates, and less
48 * than the first element in the second of the two nodes.
49 * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
50 * of each node is poisoned appropriately. Note that poisoning always occurs if
51 * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
52 * operation.
53 *
54 * Intensity 4 and 5 are particularly expensive to perform; the previous levels
55 * are a few memory operations per node, while these levels require multiple
56 * operations per element. In addition, when creating large btrees, these
57 * operations are called at every step, resulting in extremely slow operation
58 * (while the asymptotic complexity of the other steps is the same, the
59 * importance of the constant factors cannot be denied).
60 */
61 uint_t zfs_btree_verify_intensity = 0;
62
63 /*
64 * Convenience functions to silence warnings from memcpy/memmove's
65 * return values and change argument order to src, dest.
66 */
static void
bcpy(const void *from, void *to, size_t len)
{
	(void) memcpy(to, from, len);
}
72
static void
bmov(const void *from, void *to, size_t len)
{
	(void) memmove(to, from, len);
}
78
/*
 * A core (interior) node is marked by bth_first == -1; leaf nodes use
 * bth_first as the index of their first valid element instead.
 */
static boolean_t
zfs_btree_is_core(struct zfs_btree_hdr *hdr)
{
	return (hdr->bth_first == -1);
}
84
85 #ifdef _ILP32
86 #define BTREE_POISON 0xabadb10c
87 #else
88 #define BTREE_POISON 0xabadb10cdeadbeef
89 #endif
90
/*
 * Fill all currently-unused space in the node with the poison pattern:
 * BTREE_POISON for child pointers, 0x0f bytes for element storage.
 * No-op unless ZFS_DEBUG is defined.
 */
static void
zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/* A core node with bth_count elements has count+1 children. */
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			node->btc_children[i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
	} else {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Poison the gap before the first valid element... */
		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
		/* ...and everything after the last valid element. */
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
		    (hdr->bth_first + hdr->bth_count) * size);
	}
#else
	(void) tree; (void) hdr;
#endif
}
117
/*
 * Poison count element slots starting at logical index idx (and, for core
 * nodes, the child pointers following them). No-op unless ZFS_DEBUG.
 */
static inline void
zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx, uint32_t count)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, >=, hdr->bth_count);
		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 1; i <= count; i++) {
			node->btc_children[idx + i] =
			    (zfs_btree_hdr_t *)BTREE_POISON;
		}
		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
	} else {
		ASSERT3U(idx, <=, tree->bt_leaf_cap);
		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Leaf indices are relative to bth_first. */
		(void) memset(leaf->btl_elems +
		    (hdr->bth_first + idx) * size, 0x0f, count * size);
	}
#else
	(void) tree; (void) hdr; (void) idx; (void) count;
#endif
}
145
/*
 * Verify that the poison pattern written by zfs_btree_poison_node{,_at} is
 * intact at element position idx of hdr. No-op unless ZFS_DEBUG.
 */
static inline void
zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    uint32_t idx)
{
#ifdef ZFS_DEBUG
	size_t size = tree->bt_elem_size;
	if (zfs_btree_is_core(hdr)) {
		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
		/* The child after the poisoned element is also poisoned. */
		VERIFY3P(node->btc_children[idx + 1], ==, cval);
		for (size_t i = 0; i < size; i++)
			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
	} else {
		ASSERT3U(idx, <, tree->bt_leaf_cap);
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/*
		 * Positions beyond the end of the buffer (relative to
		 * bth_first) do not exist; nothing to check there.
		 */
		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
			return;
		for (size_t i = 0; i < size; i++) {
			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
			    * size + i], ==, 0x0f);
		}
	}
#else
	(void) tree; (void) hdr; (void) idx;
#endif
}
173
/*
 * Module init: create the kmem cache for default-sized leaf nodes.
 * Must run before any zfs_btree_create() call.
 */
void
zfs_btree_init(void)
{
	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}
180
/* Module fini: destroy the leaf-node kmem cache created in init. */
void
zfs_btree_fini(void)
{
	kmem_cache_destroy(zfs_btree_leaf_cache);
}
186
187 static void *
zfs_btree_leaf_alloc(zfs_btree_t * tree)188 zfs_btree_leaf_alloc(zfs_btree_t *tree)
189 {
190 if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
191 return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
192 else
193 return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
194 }
195
196 static void
zfs_btree_leaf_free(zfs_btree_t * tree,void * ptr)197 zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
198 {
199 if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
200 return (kmem_cache_free(zfs_btree_leaf_cache, ptr));
201 else
202 return (kmem_free(ptr, tree->bt_leaf_size));
203 }
204
/*
 * Create a btree with the default leaf size. compar orders the elements;
 * bt_find_in_buf may be NULL to use the generic binary search; size is the
 * element size in bytes.
 */
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf, size_t size)
{
	/* Verify zfs_btree_init() was called before zfs_btree_create() */
	ASSERT(zfs_btree_leaf_cache != NULL);

	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
	    BTREE_LEAF_SIZE);
}
215
216 static void *
217 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
218 const void *value, zfs_btree_index_t *where);
219
/*
 * Create a btree with a caller-chosen leaf allocation size (lsize). The
 * element size must be at most half the leaf element area so a leaf can
 * always hold at least two elements.
 */
void
zfs_btree_create_custom(zfs_btree_t *tree,
    int (*compar) (const void *, const void *),
    bt_find_in_buf_f bt_find_in_buf,
    size_t size, size_t lsize)
{
	/* Usable element bytes in a leaf, after the header. */
	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);

	ASSERT3U(size, <=, esize / 2);
	memset(tree, 0, sizeof (*tree));
	tree->bt_compar = compar;
	tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
	    zfs_btree_find_in_buf : bt_find_in_buf;
	tree->bt_elem_size = size;
	tree->bt_leaf_size = lsize;
	/* Round capacity down to an even count. */
	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
	tree->bt_height = -1;	/* -1 denotes an empty tree. */
	tree->bt_bulk = NULL;
}
239
240 /*
241 * Find value in the array of elements provided. Uses a simple binary search.
242 */
243 static void *
zfs_btree_find_in_buf(zfs_btree_t * tree,uint8_t * buf,uint32_t nelems,const void * value,zfs_btree_index_t * where)244 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
245 const void *value, zfs_btree_index_t *where)
246 {
247 uint32_t max = nelems;
248 uint32_t min = 0;
249 while (max > min) {
250 uint32_t idx = (min + max) / 2;
251 uint8_t *cur = buf + idx * tree->bt_elem_size;
252 int comp = tree->bt_compar(cur, value);
253 if (comp < 0) {
254 min = idx + 1;
255 } else if (comp > 0) {
256 max = idx;
257 } else {
258 where->bti_offset = idx;
259 where->bti_before = B_FALSE;
260 return (cur);
261 }
262 }
263
264 where->bti_offset = max;
265 where->bti_before = B_TRUE;
266 return (NULL);
267 }
268
/*
 * Find the given value in the tree. where may be passed as null to use as a
 * membership test or if the btree is being used as a map. Returns the
 * matching element, or NULL (with where set to the insertion point).
 */
void *
zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		/* Empty tree: anything would be inserted at the start. */
		if (where != NULL) {
			where->bti_node = NULL;
			where->bti_offset = 0;
		}
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}

	/*
	 * If we're in bulk-insert mode, we check the last spot in the tree
	 * and the last leaf in the tree before doing the normal search,
	 * because for most workloads the vast majority of finds in
	 * bulk-insert mode are to insert new elements.
	 */
	zfs_btree_index_t idx;
	size_t size = tree->bt_elem_size;
	if (tree->bt_bulk != NULL) {
		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
		/* Compare against the last element in the tree. */
		int comp = tree->bt_compar(last_leaf->btl_elems +
		    (last_leaf->btl_hdr.bth_first +
		    last_leaf->btl_hdr.bth_count - 1) * size, value);
		if (comp < 0) {
			/*
			 * If what they're looking for is after the last
			 * element, it's not in the tree.
			 */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count;
				where->bti_before = B_TRUE;
			}
			return (NULL);
		} else if (comp == 0) {
			/* Exact match with the last element. */
			if (where != NULL) {
				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
				where->bti_offset =
				    last_leaf->btl_hdr.bth_count - 1;
				where->bti_before = B_FALSE;
			}
			return (last_leaf->btl_elems +
			    (last_leaf->btl_hdr.bth_first +
			    last_leaf->btl_hdr.bth_count - 1) * size);
		}
		if (tree->bt_compar(last_leaf->btl_elems +
		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
			/*
			 * If what they're looking for is after the first
			 * element in the last leaf, it's in the last leaf or
			 * it's not in the tree.
			 */
			void *d = tree->bt_find_in_buf(tree,
			    last_leaf->btl_elems +
			    last_leaf->btl_hdr.bth_first * size,
			    last_leaf->btl_hdr.bth_count, value, &idx);

			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
				*where = idx;
			}
			return (d);
		}
	}

	zfs_btree_core_t *node = NULL;
	uint32_t child = 0;
	uint32_t depth = 0;

	/*
	 * Iterate down the tree, finding which child the value should be in
	 * by comparing with the separators.
	 */
	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
		ASSERT3P(node, !=, NULL);
		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
		    node->btc_hdr.bth_count, value, &idx);
		EQUIV(d != NULL, !idx.bti_before);
		if (d != NULL) {
			/* The value is one of this core node's separators. */
			if (where != NULL) {
				idx.bti_node = (zfs_btree_hdr_t *)node;
				*where = idx;
			}
			return (d);
		}
		ASSERT(idx.bti_before);
		child = idx.bti_offset;
	}

	/*
	 * The value is in this leaf, or it would be if it were in the
	 * tree. Find its proper location and return it.
	 */
	zfs_btree_leaf_t *leaf = (depth == 0 ?
	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
	    leaf->btl_hdr.bth_first * size,
	    leaf->btl_hdr.bth_count, value, &idx);

	if (where != NULL) {
		idx.bti_node = (zfs_btree_hdr_t *)leaf;
		*where = idx;
	}

	return (d);
}
383
384 /*
385 * To explain the following functions, it is useful to understand the four
386 * kinds of shifts used in btree operation. First, a shift is a movement of
387 * elements within a node. It is used to create gaps for inserting new
388 * elements and children, or cover gaps created when things are removed. A
389 * shift has two fundamental properties, each of which can be one of two
390 * values, making four types of shifts. There is the direction of the shift
391 * (left or right) and the shape of the shift (parallelogram or isoceles
392 * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
393 * applies to shifts of core nodes.
394 *
395 * The names derive from the following imagining of the layout of a node:
396 *
397 * Elements: * * * * * * * ... * * *
398 * Children: * * * * * * * * ... * * *
399 *
400 * This layout follows from the fact that the elements act as separators
401 * between pairs of children, and that children root subtrees "below" the
402 * current node. A left and right shift are fairly self-explanatory; a left
403 * shift moves things to the left, while a right shift moves things to the
404 * right. A parallelogram shift is a shift with the same number of elements
405 * and children being moved, while a trapezoid shift is a shift that moves one
406 * more children than elements. An example follows:
407 *
408 * A parallelogram shift could contain the following:
409 * _______________
410 * \* * * * \ * * * ... * * *
411 * * \ * * * *\ * * * ... * * *
412 * ---------------
413 * A trapezoid shift could contain the following:
414 * ___________
415 * * / * * * \ * * * ... * * *
416 * * / * * * *\ * * * ... * * *
417 * ---------------
418 *
419 * Note that a parallelogram shift is always shaped like a "left-leaning"
420 * parallelogram, where the starting index of the children being moved is
421 * always one higher than the starting index of the elements being moved. No
422 * "right-leaning" parallelogram shifts are needed (shifts where the starting
423 * element index and starting child index being moved are the same) to achieve
424 * any btree operations, so we ignore them.
425 */
426
/* Shape of a core-node shift; see the diagrams in the comment above. */
enum bt_shift_shape {
	BSS_TRAPEZOID,
	BSS_PARALLELOGRAM
};

/* Direction in which elements/children move within a node. */
enum bt_shift_direction {
	BSD_LEFT,
	BSD_RIGHT
};
436
/*
 * Shift elements and children in the provided core node by off spots. The
 * first element moved is idx, and count elements are moved. The shape of the
 * shift is determined by shape. The direction is determined by dir.
 */
static inline void
bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_shape shape,
    enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&node->btc_hdr));

	uint8_t *e_start = node->btc_elems + idx * size;
	uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size :
	    e_start + off * size);
	bmov(e_start, e_out, count * size);

	/*
	 * A trapezoid shift starts at child idx and moves count + 1
	 * children; a parallelogram starts one child later and moves count.
	 */
	zfs_btree_hdr_t **c_start = node->btc_children + idx +
	    (shape == BSS_TRAPEZOID ? 0 : 1);
	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
	    c_start + off);
	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bmov(c_start, c_out, c_count * sizeof (*c_start));
}
462
/*
 * Shift elements and children in the provided core node left by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift (trapezoid or parallelogram) is determined by shape.
 */
static inline void
bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
}
475
/*
 * Shift elements and children in the provided core node right by one spot.
 * The first element moved is idx, and count elements are moved. The shape
 * of the shift (trapezoid or parallelogram) is determined by shape.
 */
static inline void
bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
    uint32_t count, enum bt_shift_shape shape)
{
	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
}
486
/*
 * Shift elements in the provided leaf node by off spots. The first element
 * moved is idx (relative to bth_first), and count elements are moved. The
 * direction is determined by dir. Leaves have no children, so only the
 * element storage moves.
 */
static inline void
bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
    uint32_t count, uint32_t off, enum bt_shift_direction dir)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &node->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));

	if (count == 0)
		return;
	uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size;
	uint8_t *out = (dir == BSD_LEFT ? start - off * size :
	    start + off * size);
	bmov(start, out, count * size);
}
507
/*
 * Open a gap of n element slots before idx in the leaf, growing into
 * whichever end(s) of the buffer have room, and bump bth_count.
 */
static void
bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	uint32_t capacity = tree->bt_leaf_cap;
	ASSERT3U(hdr->bth_count + n, <=, capacity);
	/* Is there room to grow at the left / right end of the buffer? */
	boolean_t cl = (hdr->bth_first >= n);
	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);

	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
		/* Grow left. */
		hdr->bth_first -= n;
		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
	} else if (cr) {
		/* Grow right. */
		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
		    BSD_RIGHT);
	} else {
		/* Grow both ways, recentering the elements in the buffer. */
		uint32_t fn = hdr->bth_first -
		    (capacity - (hdr->bth_count + n)) / 2;
		hdr->bth_first -= fn;
		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
		    n - fn, BSD_RIGHT);
	}
	hdr->bth_count += n;
}
542
/*
 * Remove n elements starting at idx from the leaf, shifting whichever side
 * has fewer elements to cover the gap, and decrement bth_count.
 */
static void
bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
    uint32_t n)
{
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT(!zfs_btree_is_core(hdr));
	ASSERT3U(idx, <=, hdr->bth_count);
	ASSERT3U(idx + n, <=, hdr->bth_count);

	if (idx <= (hdr->bth_count - n) / 2) {
		/* Fewer elements on the left; shift them right. */
		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
		zfs_btree_poison_node_at(tree, hdr, 0, n);
		hdr->bth_first += n;
	} else {
		/* Fewer elements on the right; shift them left. */
		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
		    BSD_LEFT);
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
	}
	hdr->bth_count -= n;
}
566
/*
 * Copy count elements (and their children) from one core node to another.
 * The shape parameter behaves the same as it does in the shift logic: a
 * trapezoid transfer copies one more child than elements.
 */
static inline void
bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
    enum bt_shift_shape shape)
{
	size_t size = tree->bt_elem_size;
	ASSERT(zfs_btree_is_core(&source->btc_hdr));
	ASSERT(zfs_btree_is_core(&dest->btc_hdr));

	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
	    count * size);

	uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
	bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
	    c_count * sizeof (*source->btc_children));
}
588
/*
 * Copy count elements from one leaf to another; both indices are relative
 * to the respective leaf's bth_first.
 */
static inline void
bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
    uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
{
	size_t size = tree->bt_elem_size;
	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));

	bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size,
	    dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size,
	    count * size);
}
601
/*
 * Find the first element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	/* Walk down the leftmost spine until we reach a leaf. */
	for (node = hdr; zfs_btree_is_core(node);
	    node = ((zfs_btree_core_t *)node)->btc_children[0])
		;

	ASSERT(!zfs_btree_is_core(node));
	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = 0;
		where->bti_before = B_FALSE;
	}
	return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]);
}
625
/*
 * Insert an element and a child into a core node at the given offset. The
 * caller must have verified the node has room (count < BTREE_CORE_ELEMS).
 */
static void
zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
    uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	ASSERT3P(par_hdr, ==, new_node->bth_parent);
	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, par_hdr,
		    par_hdr->bth_count);
	}
	/* Shift existing elements and children */
	uint32_t count = par_hdr->bth_count - offset;
	bt_shift_core_right(tree, parent, offset, count,
	    BSS_PARALLELOGRAM);

	/* Insert new values: the new child goes after its separator. */
	parent->btc_children[offset + 1] = new_node;
	bcpy(buf, parent->btc_elems + offset * size, size);
	par_hdr->bth_count++;
}
650
/*
 * Insert new_node into the parent of old_node directly after old_node, with
 * buf as the dividing element between the two. May split the parent and
 * recurse all the way up to (and including) the root.
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		new_root_hdr->bth_first = -1;	/* marks it as a core node */
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our insertions
	 * and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
	 * current node, and the others will be moved to the new core node.
	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
	 * will be used as the new separator in our parent, and the others
	 * will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;	/* marks it as a core node */
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	/* Move the back portion of the elements/children to the new node. */
	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
	    size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	/* The transferred children now belong to new_parent. */
	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}
801
/*
 * Insert an element into a leaf node at the given offset. The caller must
 * have verified the leaf has room (count < bt_leaf_cap).
 */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	/* Open a one-element gap at idx, then copy the value into it. */
	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}
820
821 static void
822 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);
823
/*
 * Helper function for inserting a new value into leaf at the given index.
 * Splits the leaf (and recurses into the parent) if it is full.
 */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	/* The new leaf becomes the rightmost (bulk) leaf if leaf was. */
	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}
909
910 static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)911 zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
912 {
913 void *buf;
914 if (zfs_btree_is_core(hdr)) {
915 buf = ((zfs_btree_core_t *)hdr)->btc_elems;
916 } else {
917 buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
918 hdr->bth_first * tree->bt_elem_size;
919 }
920 zfs_btree_index_t idx;
921 zfs_btree_core_t *parent = hdr->bth_parent;
922 VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
923 parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
924 ASSERT(idx.bti_before);
925 ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
926 ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
927 return (idx.bti_offset);
928 }
929
930 /*
931 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
932 * nodes may violate the invariant that non-root nodes must be at least half
933 * full. All nodes violating this invariant should be the last node in their
934 * particular level. To correct the invariant, we take values from their left
935 * neighbor until they are half full. They must have a left neighbor at their
936 * level because the last node at a level is not the first node unless it's
937 * the root.
938 */
939 static void
zfs_btree_bulk_finish(zfs_btree_t * tree)940 zfs_btree_bulk_finish(zfs_btree_t *tree)
941 {
942 ASSERT3P(tree->bt_bulk, !=, NULL);
943 ASSERT3P(tree->bt_root, !=, NULL);
944 zfs_btree_leaf_t *leaf = tree->bt_bulk;
945 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
946 zfs_btree_core_t *parent = hdr->bth_parent;
947 size_t size = tree->bt_elem_size;
948 uint32_t capacity = tree->bt_leaf_cap;
949
950 /*
951 * The invariant doesn't apply to the root node, if that's the only
952 * node in the tree we're done.
953 */
954 if (parent == NULL) {
955 tree->bt_bulk = NULL;
956 return;
957 }
958
959 /* First, take elements to rebalance the leaf node. */
960 if (hdr->bth_count < capacity / 2) {
961 /*
962 * First, find the left neighbor. The simplest way to do this
963 * is to call zfs_btree_prev twice; the first time finds some
964 * ancestor of this node, and the second time finds the left
965 * neighbor. The ancestor found is the lowest common ancestor
966 * of leaf and the neighbor.
967 */
968 zfs_btree_index_t idx = {
969 .bti_node = hdr,
970 .bti_offset = 0
971 };
972 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
973 ASSERT(zfs_btree_is_core(idx.bti_node));
974 zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
975 uint32_t common_idx = idx.bti_offset;
976
977 VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
978 ASSERT(!zfs_btree_is_core(idx.bti_node));
979 zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
980 zfs_btree_hdr_t *l_hdr = idx.bti_node;
981 uint32_t move_count = (capacity / 2) - hdr->bth_count;
982 ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
983 capacity / 2);
984
985 if (zfs_btree_verify_intensity >= 5) {
986 for (uint32_t i = 0; i < move_count; i++) {
987 zfs_btree_verify_poison_at(tree, hdr,
988 leaf->btl_hdr.bth_count + i);
989 }
990 }
991
992 /* First, shift elements in leaf back. */
993 bt_grow_leaf(tree, leaf, 0, move_count);
994
995 /* Next, move the separator from the common ancestor to leaf. */
996 uint8_t *separator = common->btc_elems + common_idx * size;
997 uint8_t *out = leaf->btl_elems +
998 (hdr->bth_first + move_count - 1) * size;
999 bcpy(separator, out, size);
1000
1001 /*
1002 * Now we move elements from the tail of the left neighbor to
1003 * fill the remaining spots in leaf.
1004 */
1005 bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
1006 (move_count - 1), move_count - 1, leaf, 0);
1007
1008 /*
1009 * Finally, move the new last element in the left neighbor to
1010 * the separator.
1011 */
1012 bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
1013 l_hdr->bth_count - move_count) * size, separator, size);
1014
1015 /* Adjust the node's counts, and we're done. */
1016 bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
1017 move_count);
1018
1019 ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
1020 ASSERT3U(hdr->bth_count, >=, capacity / 2);
1021 }
1022
1023 /*
1024 * Now we have to rebalance any ancestors of leaf that may also
1025 * violate the invariant.
1026 */
1027 capacity = BTREE_CORE_ELEMS;
1028 while (parent->btc_hdr.bth_parent != NULL) {
1029 zfs_btree_core_t *cur = parent;
1030 zfs_btree_hdr_t *hdr = &cur->btc_hdr;
1031 parent = hdr->bth_parent;
1032 /*
1033 * If the invariant isn't violated, move on to the next
1034 * ancestor.
1035 */
1036 if (hdr->bth_count >= capacity / 2)
1037 continue;
1038
1039 /*
1040 * Because the smallest number of nodes we can move when
1041 * splitting is 2, we never need to worry about not having a
1042 * left sibling (a sibling is a neighbor with the same parent).
1043 */
1044 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1045 ASSERT3U(parent_idx, >, 0);
1046 zfs_btree_core_t *l_neighbor =
1047 (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
1048 uint32_t move_count = (capacity / 2) - hdr->bth_count;
1049 ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
1050 capacity / 2);
1051
1052 if (zfs_btree_verify_intensity >= 5) {
1053 for (uint32_t i = 0; i < move_count; i++) {
1054 zfs_btree_verify_poison_at(tree, hdr,
1055 hdr->bth_count + i);
1056 }
1057 }
1058 /* First, shift things in the right node back. */
1059 bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
1060 BSS_TRAPEZOID, BSD_RIGHT);
1061
1062 /* Next, move the separator to the right node. */
1063 uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
1064 size);
1065 uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
1066 bcpy(separator, e_out, size);
1067
1068 /*
1069 * Now, move elements and children from the left node to the
1070 * right. We move one more child than elements.
1071 */
1072 move_count--;
1073 uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
1074 bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
1075 BSS_TRAPEZOID);
1076
1077 /*
1078 * Finally, move the last element in the left node to the
1079 * separator's position.
1080 */
1081 move_idx--;
1082 bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);
1083
1084 l_neighbor->btc_hdr.bth_count -= move_count + 1;
1085 hdr->bth_count += move_count + 1;
1086
1087 ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
1088 ASSERT3U(hdr->bth_count, >=, capacity / 2);
1089
1090 zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
1091
1092 for (uint32_t i = 0; i <= hdr->bth_count; i++)
1093 cur->btc_children[i]->bth_parent = cur;
1094 }
1095
1096 tree->bt_bulk = NULL;
1097 zfs_btree_verify(tree);
1098 }
1099
1100 /*
1101 * Insert value into tree at the location specified by where.
1102 */
1103 void
zfs_btree_add_idx(zfs_btree_t * tree,const void * value,const zfs_btree_index_t * where)1104 zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
1105 const zfs_btree_index_t *where)
1106 {
1107 zfs_btree_index_t idx = {0};
1108
1109 /* If we're not inserting in the last leaf, end bulk insert mode. */
1110 if (tree->bt_bulk != NULL) {
1111 if (where->bti_node != &tree->bt_bulk->btl_hdr) {
1112 zfs_btree_bulk_finish(tree);
1113 VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
1114 where = &idx;
1115 }
1116 }
1117
1118 tree->bt_num_elems++;
1119 /*
1120 * If this is the first element in the tree, create a leaf root node
1121 * and add the value to it.
1122 */
1123 if (where->bti_node == NULL) {
1124 ASSERT3U(tree->bt_num_elems, ==, 1);
1125 ASSERT3S(tree->bt_height, ==, -1);
1126 ASSERT0P(tree->bt_root);
1127 ASSERT0(where->bti_offset);
1128
1129 tree->bt_num_nodes++;
1130 zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
1131 tree->bt_root = &leaf->btl_hdr;
1132 tree->bt_height++;
1133
1134 zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
1135 hdr->bth_parent = NULL;
1136 hdr->bth_first = 0;
1137 hdr->bth_count = 0;
1138 zfs_btree_poison_node(tree, hdr);
1139
1140 zfs_btree_insert_into_leaf(tree, leaf, value, 0);
1141 tree->bt_bulk = leaf;
1142 } else if (!zfs_btree_is_core(where->bti_node)) {
1143 /*
1144 * If we're inserting into a leaf, go directly to the helper
1145 * function.
1146 */
1147 zfs_btree_insert_into_leaf(tree,
1148 (zfs_btree_leaf_t *)where->bti_node, value,
1149 where->bti_offset);
1150 } else {
1151 /*
1152 * If we're inserting into a core node, we can't just shift
1153 * the existing element in that slot in the same node without
1154 * breaking our ordering invariants. Instead we place the new
1155 * value in the node at that spot and then insert the old
1156 * separator into the first slot in the subtree to the right.
1157 */
1158 zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;
1159
1160 /*
1161 * We can ignore bti_before, because either way the value
1162 * should end up in bti_offset.
1163 */
1164 uint32_t off = where->bti_offset;
1165 zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
1166 size_t size = tree->bt_elem_size;
1167 uint8_t *buf = kmem_alloc(size, KM_SLEEP);
1168 bcpy(node->btc_elems + off * size, buf, size);
1169 bcpy(value, node->btc_elems + off * size, size);
1170
1171 /*
1172 * Find the first slot in the subtree to the right, insert
1173 * there.
1174 */
1175 zfs_btree_index_t new_idx;
1176 VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
1177 NULL);
1178 ASSERT0(new_idx.bti_offset);
1179 ASSERT(!zfs_btree_is_core(new_idx.bti_node));
1180 zfs_btree_insert_into_leaf(tree,
1181 (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
1182 kmem_free(buf, size);
1183 }
1184 zfs_btree_verify(tree);
1185 }
1186
1187 /*
1188 * Return the first element in the tree, and put its location in where if
1189 * non-null.
1190 */
1191 void *
zfs_btree_first(zfs_btree_t * tree,zfs_btree_index_t * where)1192 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
1193 {
1194 if (tree->bt_height == -1) {
1195 ASSERT0(tree->bt_num_elems);
1196 return (NULL);
1197 }
1198 return (zfs_btree_first_helper(tree, tree->bt_root, where));
1199 }
1200
1201 /*
1202 * Find the last element in the subtree rooted at hdr, return its value and
1203 * put its location in where if non-null.
1204 */
1205 static void *
zfs_btree_last_helper(zfs_btree_t * btree,zfs_btree_hdr_t * hdr,zfs_btree_index_t * where)1206 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
1207 zfs_btree_index_t *where)
1208 {
1209 zfs_btree_hdr_t *node;
1210
1211 for (node = hdr; zfs_btree_is_core(node); node =
1212 ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
1213 ;
1214
1215 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
1216 if (where != NULL) {
1217 where->bti_node = node;
1218 where->bti_offset = node->bth_count - 1;
1219 where->bti_before = B_FALSE;
1220 }
1221 return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
1222 btree->bt_elem_size);
1223 }
1224
1225 /*
1226 * Return the last element in the tree, and put its location in where if
1227 * non-null.
1228 */
1229 void *
zfs_btree_last(zfs_btree_t * tree,zfs_btree_index_t * where)1230 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
1231 {
1232 if (tree->bt_height == -1) {
1233 ASSERT0(tree->bt_num_elems);
1234 return (NULL);
1235 }
1236 return (zfs_btree_last_helper(tree, tree->bt_root, where));
1237 }
1238
1239 /*
1240 * This function contains the logic to find the next node in the tree. A
1241 * helper function is used because there are multiple internal consumemrs of
1242 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
1243 * node after we've finished with it.
1244 */
1245 static void *
zfs_btree_next_helper(zfs_btree_t * tree,const zfs_btree_index_t * idx,zfs_btree_index_t * out_idx,void (* done_func)(zfs_btree_t *,zfs_btree_hdr_t *))1246 zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
1247 zfs_btree_index_t *out_idx,
1248 void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
1249 {
1250 if (idx->bti_node == NULL) {
1251 ASSERT3S(tree->bt_height, ==, -1);
1252 return (NULL);
1253 }
1254
1255 uint32_t offset = idx->bti_offset;
1256 if (!zfs_btree_is_core(idx->bti_node)) {
1257 /*
1258 * When finding the next element of an element in a leaf,
1259 * there are two cases. If the element isn't the last one in
1260 * the leaf, in which case we just return the next element in
1261 * the leaf. Otherwise, we need to traverse up our parents
1262 * until we find one where our ancestor isn't the last child
1263 * of its parent. Once we do, the next element is the
1264 * separator after our ancestor in its parent.
1265 */
1266 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1267 uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
1268 if (leaf->btl_hdr.bth_count > new_off) {
1269 out_idx->bti_node = &leaf->btl_hdr;
1270 out_idx->bti_offset = new_off;
1271 out_idx->bti_before = B_FALSE;
1272 return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1273 new_off) * tree->bt_elem_size);
1274 }
1275
1276 zfs_btree_hdr_t *prev = &leaf->btl_hdr;
1277 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
1278 node != NULL; node = node->btc_hdr.bth_parent) {
1279 zfs_btree_hdr_t *hdr = &node->btc_hdr;
1280 ASSERT(zfs_btree_is_core(hdr));
1281 uint32_t i = zfs_btree_find_parent_idx(tree, prev);
1282 if (done_func != NULL)
1283 done_func(tree, prev);
1284 if (i == hdr->bth_count) {
1285 prev = hdr;
1286 continue;
1287 }
1288 out_idx->bti_node = hdr;
1289 out_idx->bti_offset = i;
1290 out_idx->bti_before = B_FALSE;
1291 return (node->btc_elems + i * tree->bt_elem_size);
1292 }
1293 if (done_func != NULL)
1294 done_func(tree, prev);
1295 /*
1296 * We've traversed all the way up and been at the end of the
1297 * node every time, so this was the last element in the tree.
1298 */
1299 return (NULL);
1300 }
1301
1302 /* If we were before an element in a core node, return that element. */
1303 ASSERT(zfs_btree_is_core(idx->bti_node));
1304 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1305 if (idx->bti_before) {
1306 out_idx->bti_before = B_FALSE;
1307 return (node->btc_elems + offset * tree->bt_elem_size);
1308 }
1309
1310 /*
1311 * The next element from one in a core node is the first element in
1312 * the subtree just to the right of the separator.
1313 */
1314 zfs_btree_hdr_t *child = node->btc_children[offset + 1];
1315 return (zfs_btree_first_helper(tree, child, out_idx));
1316 }
1317
1318 /*
1319 * Return the next valued node in the tree. The same address can be safely
1320 * passed for idx and out_idx.
1321 */
1322 void *
zfs_btree_next(zfs_btree_t * tree,const zfs_btree_index_t * idx,zfs_btree_index_t * out_idx)1323 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
1324 zfs_btree_index_t *out_idx)
1325 {
1326 return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
1327 }
1328
1329 /*
1330 * Return the previous valued node in the tree. The same value can be safely
1331 * passed for idx and out_idx.
1332 */
1333 void *
zfs_btree_prev(zfs_btree_t * tree,const zfs_btree_index_t * idx,zfs_btree_index_t * out_idx)1334 zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
1335 zfs_btree_index_t *out_idx)
1336 {
1337 if (idx->bti_node == NULL) {
1338 ASSERT3S(tree->bt_height, ==, -1);
1339 return (NULL);
1340 }
1341
1342 uint32_t offset = idx->bti_offset;
1343 if (!zfs_btree_is_core(idx->bti_node)) {
1344 /*
1345 * When finding the previous element of an element in a leaf,
1346 * there are two cases. If the element isn't the first one in
1347 * the leaf, in which case we just return the previous element
1348 * in the leaf. Otherwise, we need to traverse up our parents
1349 * until we find one where our previous ancestor isn't the
1350 * first child. Once we do, the previous element is the
1351 * separator after our previous ancestor.
1352 */
1353 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1354 if (offset != 0) {
1355 out_idx->bti_node = &leaf->btl_hdr;
1356 out_idx->bti_offset = offset - 1;
1357 out_idx->bti_before = B_FALSE;
1358 return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1359 offset - 1) * tree->bt_elem_size);
1360 }
1361 zfs_btree_hdr_t *prev = &leaf->btl_hdr;
1362 for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
1363 node != NULL; node = node->btc_hdr.bth_parent) {
1364 zfs_btree_hdr_t *hdr = &node->btc_hdr;
1365 ASSERT(zfs_btree_is_core(hdr));
1366 uint32_t i = zfs_btree_find_parent_idx(tree, prev);
1367 if (i == 0) {
1368 prev = hdr;
1369 continue;
1370 }
1371 out_idx->bti_node = hdr;
1372 out_idx->bti_offset = i - 1;
1373 out_idx->bti_before = B_FALSE;
1374 return (node->btc_elems + (i - 1) * tree->bt_elem_size);
1375 }
1376 /*
1377 * We've traversed all the way up and been at the start of the
1378 * node every time, so this was the first node in the tree.
1379 */
1380 return (NULL);
1381 }
1382
1383 /*
1384 * The previous element from one in a core node is the last element in
1385 * the subtree just to the left of the separator.
1386 */
1387 ASSERT(zfs_btree_is_core(idx->bti_node));
1388 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1389 zfs_btree_hdr_t *child = node->btc_children[offset];
1390 return (zfs_btree_last_helper(tree, child, out_idx));
1391 }
1392
1393 /*
1394 * Get the value at the provided index in the tree.
1395 *
1396 * Note that the value returned from this function can be mutated, but only
1397 * if it will not change the ordering of the element with respect to any other
1398 * elements that could be in the tree.
1399 */
1400 void *
zfs_btree_get(zfs_btree_t * tree,zfs_btree_index_t * idx)1401 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
1402 {
1403 ASSERT(!idx->bti_before);
1404 size_t size = tree->bt_elem_size;
1405 if (!zfs_btree_is_core(idx->bti_node)) {
1406 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
1407 return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
1408 idx->bti_offset) * size);
1409 }
1410 zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
1411 return (node->btc_elems + idx->bti_offset * size);
1412 }
1413
/* Add the given value to the tree. Must not already be in the tree. */
void
zfs_btree_add(zfs_btree_t *tree, const void *node)
{
	zfs_btree_index_t where = {0};
	/* The find must miss (return NULL); duplicates are not allowed. */
	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
	zfs_btree_add_idx(tree, node, &where);
}
1422
1423 /* Helper function to free a tree node. */
1424 static void
zfs_btree_node_destroy(zfs_btree_t * tree,zfs_btree_hdr_t * node)1425 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
1426 {
1427 tree->bt_num_nodes--;
1428 if (!zfs_btree_is_core(node)) {
1429 zfs_btree_leaf_free(tree, node);
1430 } else {
1431 kmem_free(node, sizeof (zfs_btree_core_t) +
1432 BTREE_CORE_ELEMS * tree->bt_elem_size);
1433 }
1434 }
1435
1436 /*
1437 * Remove the rm_hdr and the separator to its left from the parent node. The
1438 * buffer that rm_hdr was stored in may already be freed, so its contents
1439 * cannot be accessed.
1440 */
1441 static void
zfs_btree_remove_from_node(zfs_btree_t * tree,zfs_btree_core_t * node,zfs_btree_hdr_t * rm_hdr)1442 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
1443 zfs_btree_hdr_t *rm_hdr)
1444 {
1445 size_t size = tree->bt_elem_size;
1446 uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
1447 zfs_btree_hdr_t *hdr = &node->btc_hdr;
1448 /*
1449 * If the node is the root node and rm_hdr is one of two children,
1450 * promote the other child to the root.
1451 */
1452 if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
1453 ASSERT3U(hdr->bth_count, ==, 1);
1454 ASSERT3P(tree->bt_root, ==, node);
1455 ASSERT3P(node->btc_children[1], ==, rm_hdr);
1456 tree->bt_root = node->btc_children[0];
1457 node->btc_children[0]->bth_parent = NULL;
1458 zfs_btree_node_destroy(tree, hdr);
1459 tree->bt_height--;
1460 return;
1461 }
1462
1463 uint32_t idx;
1464 for (idx = 0; idx <= hdr->bth_count; idx++) {
1465 if (node->btc_children[idx] == rm_hdr)
1466 break;
1467 }
1468 ASSERT3U(idx, <=, hdr->bth_count);
1469
1470 /*
1471 * If the node is the root or it has more than the minimum number of
1472 * children, just remove the child and separator, and return.
1473 */
1474 if (hdr->bth_parent == NULL ||
1475 hdr->bth_count > min_count) {
1476 /*
1477 * Shift the element and children to the right of rm_hdr to
1478 * the left by one spot.
1479 */
1480 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
1481 BSS_PARALLELOGRAM);
1482 hdr->bth_count--;
1483 zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
1484 return;
1485 }
1486
1487 ASSERT3U(hdr->bth_count, ==, min_count);
1488
1489 /*
1490 * Now we try to take a node from a neighbor. We check left, then
1491 * right. If the neighbor exists and has more than the minimum number
1492 * of elements, we move the separator between us and them to our
1493 * node, move their closest element (last for left, first for right)
1494 * to the separator, and move their closest child to our node. Along
1495 * the way we need to collapse the gap made by idx, and (for our right
1496 * neighbor) the gap made by removing their first element and child.
1497 *
1498 * Note: this logic currently doesn't support taking from a neighbor
1499 * that isn't a sibling (i.e. a neighbor with a different
1500 * parent). This isn't critical functionality, but may be worth
1501 * implementing in the future for completeness' sake.
1502 */
1503 zfs_btree_core_t *parent = hdr->bth_parent;
1504 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1505
1506 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1507 parent->btc_children[parent_idx - 1]);
1508 if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1509 /* We can take a node from the left neighbor. */
1510 ASSERT(zfs_btree_is_core(l_hdr));
1511 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
1512
1513 /*
1514 * Start by shifting the elements and children in the current
1515 * node to the right by one spot.
1516 */
1517 bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
1518
1519 /*
1520 * Move the separator between node and neighbor to the first
1521 * element slot in the current node.
1522 */
1523 uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1524 size;
1525 bcpy(separator, node->btc_elems, size);
1526
1527 /* Move the last child of neighbor to our first child slot. */
1528 node->btc_children[0] =
1529 neighbor->btc_children[l_hdr->bth_count];
1530 node->btc_children[0]->bth_parent = node;
1531
1532 /* Move the last element of neighbor to the separator spot. */
1533 uint8_t *take_elem = neighbor->btc_elems +
1534 (l_hdr->bth_count - 1) * size;
1535 bcpy(take_elem, separator, size);
1536 l_hdr->bth_count--;
1537 zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
1538 return;
1539 }
1540
1541 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1542 NULL : parent->btc_children[parent_idx + 1]);
1543 if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1544 /* We can take a node from the right neighbor. */
1545 ASSERT(zfs_btree_is_core(r_hdr));
1546 zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
1547
1548 /*
1549 * Shift elements in node left by one spot to overwrite rm_hdr
1550 * and the separator before it.
1551 */
1552 bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
1553 BSS_PARALLELOGRAM);
1554
1555 /*
1556 * Move the separator between node and neighbor to the last
1557 * element spot in node.
1558 */
1559 uint8_t *separator = parent->btc_elems + parent_idx * size;
1560 bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
1561 size);
1562
1563 /*
1564 * Move the first child of neighbor to the last child spot in
1565 * node.
1566 */
1567 node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
1568 node->btc_children[hdr->bth_count]->bth_parent = node;
1569
1570 /* Move the first element of neighbor to the separator spot. */
1571 uint8_t *take_elem = neighbor->btc_elems;
1572 bcpy(take_elem, separator, size);
1573 r_hdr->bth_count--;
1574
1575 /*
1576 * Shift the elements and children of neighbor to cover the
1577 * stolen elements.
1578 */
1579 bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
1580 BSS_TRAPEZOID);
1581 zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
1582 return;
1583 }
1584
1585 /*
1586 * In this case, neither of our neighbors can spare an element, so we
1587 * need to merge with one of them. We prefer the left one,
1588 * arbitrarily. Move the separator into the leftmost merging node
1589 * (which may be us or the left neighbor), and then move the right
1590 * merging node's elements. Once that's done, we go back and delete
1591 * the element we're removing. Finally, go into the parent and delete
1592 * the right merging node and the separator. This may cause further
1593 * merging.
1594 */
1595 zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
1596 uint32_t new_idx = idx;
1597 if (l_hdr != NULL) {
1598 keep_hdr = l_hdr;
1599 new_rm_hdr = hdr;
1600 new_idx += keep_hdr->bth_count + 1;
1601 } else {
1602 ASSERT3P(r_hdr, !=, NULL);
1603 keep_hdr = hdr;
1604 new_rm_hdr = r_hdr;
1605 parent_idx++;
1606 }
1607
1608 ASSERT(zfs_btree_is_core(keep_hdr));
1609 ASSERT(zfs_btree_is_core(new_rm_hdr));
1610
1611 zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
1612 zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
1613
1614 if (zfs_btree_verify_intensity >= 5) {
1615 for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
1616 zfs_btree_verify_poison_at(tree, keep_hdr,
1617 keep_hdr->bth_count + i);
1618 }
1619 }
1620
1621 /* Move the separator into the left node. */
1622 uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
1623 uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1624 size;
1625 bcpy(separator, e_out, size);
1626 keep_hdr->bth_count++;
1627
1628 /* Move all our elements and children into the left node. */
1629 bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
1630 keep_hdr->bth_count, BSS_TRAPEZOID);
1631
1632 uint32_t old_count = keep_hdr->bth_count;
1633
1634 /* Update bookkeeping */
1635 keep_hdr->bth_count += new_rm_hdr->bth_count;
1636 ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
1637
1638 /*
1639 * Shift the element and children to the right of rm_hdr to
1640 * the left by one spot.
1641 */
1642 ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
1643 bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
1644 BSS_PARALLELOGRAM);
1645 keep_hdr->bth_count--;
1646
1647 /* Reparent all our children to point to the left node. */
1648 zfs_btree_hdr_t **new_start = keep->btc_children +
1649 old_count - 1;
1650 for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
1651 new_start[i]->bth_parent = keep;
1652 for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
1653 ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
1654 ASSERT3P(keep->btc_children[i], !=, rm_hdr);
1655 }
1656 zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);
1657
1658 new_rm_hdr->bth_count = 0;
1659 zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
1660 zfs_btree_node_destroy(tree, new_rm_hdr);
1661 }
1662
1663 /* Remove the element at the specific location. */
1664 void
zfs_btree_remove_idx(zfs_btree_t * tree,zfs_btree_index_t * where)1665 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
1666 {
1667 size_t size = tree->bt_elem_size;
1668 zfs_btree_hdr_t *hdr = where->bti_node;
1669 uint32_t idx = where->bti_offset;
1670
1671 ASSERT(!where->bti_before);
1672 if (tree->bt_bulk != NULL) {
1673 /*
1674 * Leave bulk insert mode. Note that our index would be
1675 * invalid after we correct the tree, so we copy the value
1676 * we're planning to remove and find it again after
1677 * bulk_finish.
1678 */
1679 uint8_t *value = zfs_btree_get(tree, where);
1680 uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
1681 bcpy(value, tmp, size);
1682 zfs_btree_bulk_finish(tree);
1683 VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
1684 kmem_free(tmp, size);
1685 hdr = where->bti_node;
1686 idx = where->bti_offset;
1687 }
1688
1689 tree->bt_num_elems--;
1690 /*
1691 * If the element happens to be in a core node, we move a leaf node's
1692 * element into its place and then remove the leaf node element. This
1693 * makes the rebalance logic not need to be recursive both upwards and
1694 * downwards.
1695 */
1696 if (zfs_btree_is_core(hdr)) {
1697 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1698 zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
1699 void *new_value = zfs_btree_last_helper(tree, left_subtree,
1700 where);
1701 ASSERT3P(new_value, !=, NULL);
1702
1703 bcpy(new_value, node->btc_elems + idx * size, size);
1704
1705 hdr = where->bti_node;
1706 idx = where->bti_offset;
1707 ASSERT(!where->bti_before);
1708 }
1709
1710 /*
1711 * First, we'll update the leaf's metadata. Then, we shift any
1712 * elements after the idx to the left. After that, we rebalance if
1713 * needed.
1714 */
1715 ASSERT(!zfs_btree_is_core(hdr));
1716 zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
1717 ASSERT3U(hdr->bth_count, >, 0);
1718
1719 uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;
1720
1721 /*
1722 * If we're over the minimum size or this is the root, just overwrite
1723 * the value and return.
1724 */
1725 if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
1726 bt_shrink_leaf(tree, leaf, idx, 1);
1727 if (hdr->bth_parent == NULL) {
1728 ASSERT0(tree->bt_height);
1729 if (hdr->bth_count == 0) {
1730 tree->bt_root = NULL;
1731 tree->bt_height--;
1732 zfs_btree_node_destroy(tree, &leaf->btl_hdr);
1733 }
1734 }
1735 zfs_btree_verify(tree);
1736 return;
1737 }
1738 ASSERT3U(hdr->bth_count, ==, min_count);
1739
1740 /*
1741 * Now we try to take a node from a sibling. We check left, then
1742 * right. If they exist and have more than the minimum number of
1743 * elements, we move the separator between us and them to our node
1744 * and move their closest element (last for left, first for right) to
1745 * the separator. Along the way we need to collapse the gap made by
1746 * idx, and (for our right neighbor) the gap made by removing their
1747 * first element.
1748 *
1749 * Note: this logic currently doesn't support taking from a neighbor
1750 * that isn't a sibling. This isn't critical functionality, but may be
1751 * worth implementing in the future for completeness' sake.
1752 */
1753 zfs_btree_core_t *parent = hdr->bth_parent;
1754 uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
1755
1756 zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
1757 parent->btc_children[parent_idx - 1]);
1758 if (l_hdr != NULL && l_hdr->bth_count > min_count) {
1759 /* We can take a node from the left neighbor. */
1760 ASSERT(!zfs_btree_is_core(l_hdr));
1761 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;
1762
1763 /*
1764 * Move our elements back by one spot to make room for the
1765 * stolen element and overwrite the element being removed.
1766 */
1767 bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);
1768
1769 /* Move the separator to our first spot. */
1770 uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
1771 size;
1772 bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);
1773
1774 /* Move our neighbor's last element to the separator. */
1775 uint8_t *take_elem = neighbor->btl_elems +
1776 (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
1777 bcpy(take_elem, separator, size);
1778
1779 /* Delete our neighbor's last element. */
1780 bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
1781 zfs_btree_verify(tree);
1782 return;
1783 }
1784
1785 zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
1786 NULL : parent->btc_children[parent_idx + 1]);
1787 if (r_hdr != NULL && r_hdr->bth_count > min_count) {
1788 /* We can take a node from the right neighbor. */
1789 ASSERT(!zfs_btree_is_core(r_hdr));
1790 zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
1791
1792 /*
1793 * Move our elements after the element being removed forwards
1794 * by one spot to make room for the stolen element and
1795 * overwrite the element being removed.
1796 */
1797 bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
1798 1, BSD_LEFT);
1799
1800 /* Move the separator between us to our last spot. */
1801 uint8_t *separator = parent->btc_elems + parent_idx * size;
1802 bcpy(separator, leaf->btl_elems + (hdr->bth_first +
1803 hdr->bth_count - 1) * size, size);
1804
1805 /* Move our neighbor's first element to the separator. */
1806 uint8_t *take_elem = neighbor->btl_elems +
1807 r_hdr->bth_first * size;
1808 bcpy(take_elem, separator, size);
1809
1810 /* Delete our neighbor's first element. */
1811 bt_shrink_leaf(tree, neighbor, 0, 1);
1812 zfs_btree_verify(tree);
1813 return;
1814 }
1815
1816 /*
1817 * In this case, neither of our neighbors can spare an element, so we
1818 * need to merge with one of them. We prefer the left one, arbitrarily.
1819 * After remove we move the separator into the leftmost merging node
1820 * (which may be us or the left neighbor), and then move the right
1821 * merging node's elements. Once that's done, we go back and delete
1822 * the element we're removing. Finally, go into the parent and delete
1823 * the right merging node and the separator. This may cause further
1824 * merging.
1825 */
1826 zfs_btree_hdr_t *rm_hdr, *k_hdr;
1827 if (l_hdr != NULL) {
1828 k_hdr = l_hdr;
1829 rm_hdr = hdr;
1830 } else {
1831 ASSERT3P(r_hdr, !=, NULL);
1832 k_hdr = hdr;
1833 rm_hdr = r_hdr;
1834 parent_idx++;
1835 }
1836 ASSERT(!zfs_btree_is_core(k_hdr));
1837 ASSERT(!zfs_btree_is_core(rm_hdr));
1838 ASSERT3U(k_hdr->bth_count, ==, min_count);
1839 ASSERT3U(rm_hdr->bth_count, ==, min_count);
1840 zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
1841 zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
1842
1843 if (zfs_btree_verify_intensity >= 5) {
1844 for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
1845 zfs_btree_verify_poison_at(tree, k_hdr,
1846 k_hdr->bth_count + i);
1847 }
1848 }
1849
1850 /*
1851 * Remove the value from the node. It will go below the minimum,
1852 * but we'll fix it in no time.
1853 */
1854 bt_shrink_leaf(tree, leaf, idx, 1);
1855
1856 /* Prepare space for elements to be moved from the right. */
1857 uint32_t k_count = k_hdr->bth_count;
1858 bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
1859 ASSERT3U(k_hdr->bth_count, ==, min_count * 2);
1860
1861 /* Move the separator into the first open spot. */
1862 uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
1863 uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
1864 bcpy(separator, out, size);
1865
1866 /* Move our elements to the left neighbor. */
1867 bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);
1868
1869 /* Remove the emptied node from the parent. */
1870 zfs_btree_remove_from_node(tree, parent, rm_hdr);
1871 zfs_btree_node_destroy(tree, rm_hdr);
1872 zfs_btree_verify(tree);
1873 }
1874
1875 /* Remove the given value from the tree. */
1876 void
zfs_btree_remove(zfs_btree_t * tree,const void * value)1877 zfs_btree_remove(zfs_btree_t *tree, const void *value)
1878 {
1879 zfs_btree_index_t where = {0};
1880 VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
1881 zfs_btree_remove_idx(tree, &where);
1882 }
1883
/*
 * Return the number of elements in the tree.
 *
 * NOTE(review): despite the "numnodes" name, this returns the element count
 * (bt_num_elems), not the node count (bt_num_nodes) — presumably historical
 * naming; confirm against callers before renaming or changing.
 */
ulong_t
zfs_btree_numnodes(zfs_btree_t *tree)
{
	return (tree->bt_num_elems);
}
1890
1891 /*
1892 * This function is used to visit all the elements in the tree before
1893 * destroying the tree. This allows the calling code to perform any cleanup it
1894 * needs to do. This is more efficient than just removing the first element
1895 * over and over, because it removes all rebalancing. Once the destroy_nodes()
1896 * function has been called, no other btree operations are valid until it
1897 * returns NULL, which point the only valid operation is zfs_btree_destroy().
1898 *
1899 * example:
1900 *
1901 * zfs_btree_index_t *cookie = NULL;
1902 * my_data_t *node;
1903 *
1904 * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
1905 * free(node->ptr);
1906 * zfs_btree_destroy(tree);
1907 *
1908 */
1909 void *
zfs_btree_destroy_nodes(zfs_btree_t * tree,zfs_btree_index_t ** cookie)1910 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
1911 {
1912 if (*cookie == NULL) {
1913 if (tree->bt_height == -1)
1914 return (NULL);
1915 *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
1916 return (zfs_btree_first(tree, *cookie));
1917 }
1918
1919 void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
1920 zfs_btree_node_destroy);
1921 if (rval == NULL) {
1922 tree->bt_root = NULL;
1923 tree->bt_height = -1;
1924 tree->bt_num_elems = 0;
1925 kmem_free(*cookie, sizeof (**cookie));
1926 tree->bt_bulk = NULL;
1927 }
1928 return (rval);
1929 }
1930
1931 static void
zfs_btree_clear_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)1932 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1933 {
1934 if (zfs_btree_is_core(hdr)) {
1935 zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
1936 for (uint32_t i = 0; i <= hdr->bth_count; i++)
1937 zfs_btree_clear_helper(tree, btc->btc_children[i]);
1938 }
1939
1940 zfs_btree_node_destroy(tree, hdr);
1941 }
1942
1943 void
zfs_btree_clear(zfs_btree_t * tree)1944 zfs_btree_clear(zfs_btree_t *tree)
1945 {
1946 if (tree->bt_root == NULL) {
1947 ASSERT0(tree->bt_num_elems);
1948 return;
1949 }
1950
1951 zfs_btree_clear_helper(tree, tree->bt_root);
1952 tree->bt_num_elems = 0;
1953 tree->bt_root = NULL;
1954 tree->bt_num_nodes = 0;
1955 tree->bt_height = -1;
1956 tree->bt_bulk = NULL;
1957 }
1958
/*
 * Destroy an (already empty) tree. The caller must have removed all elements
 * first, e.g. via zfs_btree_clear() or by running zfs_btree_destroy_nodes()
 * to completion; both assertions below enforce that.
 */
void
zfs_btree_destroy(zfs_btree_t *tree)
{
	ASSERT0(tree->bt_num_elems);
	ASSERT0P(tree->bt_root);
}
1965
1966 /* Verify that every child of this node has the correct parent pointer. */
1967 static void
zfs_btree_verify_pointers_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)1968 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1969 {
1970 if (!zfs_btree_is_core(hdr))
1971 return;
1972
1973 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
1974 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
1975 VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
1976 zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
1977 }
1978 }
1979
1980 /* Verify that every node has the correct parent pointer. */
1981 static void
zfs_btree_verify_pointers(zfs_btree_t * tree)1982 zfs_btree_verify_pointers(zfs_btree_t *tree)
1983 {
1984 if (tree->bt_height == -1) {
1985 VERIFY0P(tree->bt_root);
1986 return;
1987 }
1988 VERIFY0P(tree->bt_root->bth_parent);
1989 zfs_btree_verify_pointers_helper(tree, tree->bt_root);
1990 }
1991
1992 /*
1993 * Verify that all the current node and its children satisfy the count
1994 * invariants, and return the total count in the subtree rooted in this node.
1995 */
1996 static uint64_t
zfs_btree_verify_counts_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr)1997 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
1998 {
1999 if (!zfs_btree_is_core(hdr)) {
2000 if (tree->bt_root != hdr && tree->bt_bulk &&
2001 hdr != &tree->bt_bulk->btl_hdr) {
2002 VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
2003 }
2004
2005 return (hdr->bth_count);
2006 } else {
2007
2008 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2009 uint64_t ret = hdr->bth_count;
2010 if (tree->bt_root != hdr && tree->bt_bulk == NULL)
2011 VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
2012 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2013 ret += zfs_btree_verify_counts_helper(tree,
2014 node->btc_children[i]);
2015 }
2016
2017 return (ret);
2018 }
2019 }
2020
2021 /*
2022 * Verify that all nodes satisfy the invariants and that the total number of
2023 * elements is correct.
2024 */
2025 static void
zfs_btree_verify_counts(zfs_btree_t * tree)2026 zfs_btree_verify_counts(zfs_btree_t *tree)
2027 {
2028 EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
2029 if (tree->bt_height == -1) {
2030 return;
2031 }
2032 VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
2033 tree->bt_num_elems);
2034 }
2035
2036 /*
2037 * Check that the subtree rooted at this node has a uniform height. Returns
2038 * the number of nodes under this node, to help verify bt_num_nodes.
2039 */
2040 static uint64_t
zfs_btree_verify_height_helper(zfs_btree_t * tree,zfs_btree_hdr_t * hdr,int32_t height)2041 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
2042 int32_t height)
2043 {
2044 if (!zfs_btree_is_core(hdr)) {
2045 VERIFY0(height);
2046 return (1);
2047 }
2048
2049 zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
2050 uint64_t ret = 1;
2051 for (uint32_t i = 0; i <= hdr->bth_count; i++) {
2052 ret += zfs_btree_verify_height_helper(tree,
2053 node->btc_children[i], height - 1);
2054 }
2055 return (ret);
2056 }
2057
2058 /*
2059 * Check that the tree rooted at this node has a uniform height, and that the
2060 * bt_height in the tree is correct.
2061 */
2062 static void
zfs_btree_verify_height(zfs_btree_t * tree)2063 zfs_btree_verify_height(zfs_btree_t *tree)
2064 {
2065 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2066 if (tree->bt_height == -1) {
2067 return;
2068 }
2069
2070 VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
2071 tree->bt_height), ==, tree->bt_num_nodes);
2072 }
2073
/*
 * Check that the elements in this node are sorted, and that if this is a core
 * node, the separators are properly between the subtrees they separate and
 * that the children also satisfy this requirement.
 *
 * Note: the comparator is required to return exactly -1/0/1; the VERIFY3S
 * checks below compare against -1, not merely "< 0".
 */
static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		/* Leaf: each element must sort strictly before the next. */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		for (uint32_t i = 1; i < hdr->bth_count; i++) {
			VERIFY3S(tree->bt_compar(leaf->btl_elems +
			    (hdr->bth_first + i - 1) * size,
			    leaf->btl_elems +
			    (hdr->bth_first + i) * size), ==, -1);
		}
		return;
	}

	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
	/* Core: the separator elements themselves must be sorted. */
	for (uint32_t i = 1; i < hdr->bth_count; i++) {
		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
		    node->btc_elems + i * size), ==, -1);
	}
	for (uint32_t i = 0; i < hdr->bth_count; i++) {
		/*
		 * Separator i must compare greater than the last element of
		 * the child subtree to its left...
		 */
		uint8_t *left_child_last = NULL;
		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
		if (zfs_btree_is_core(left_child_hdr)) {
			zfs_btree_core_t *left_child =
			    (zfs_btree_core_t *)left_child_hdr;
			left_child_last = left_child->btc_elems +
			    (left_child_hdr->bth_count - 1) * size;
		} else {
			/* Leaf elements start at bth_first, not at 0. */
			zfs_btree_leaf_t *left_child =
			    (zfs_btree_leaf_t *)left_child_hdr;
			left_child_last = left_child->btl_elems +
			    (left_child_hdr->bth_first +
			    left_child_hdr->bth_count - 1) * size;
		}
		int comp = tree->bt_compar(node->btc_elems + i * size,
		    left_child_last);
		if (comp <= 0) {
			panic("btree: compar returned %d (expected 1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, left_child_last);
		}

		/*
		 * ... and less than the first element of the child subtree
		 * to its right.
		 */
		uint8_t *right_child_first = NULL;
		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
		if (zfs_btree_is_core(right_child_hdr)) {
			zfs_btree_core_t *right_child =
			    (zfs_btree_core_t *)right_child_hdr;
			right_child_first = right_child->btc_elems;
		} else {
			zfs_btree_leaf_t *right_child =
			    (zfs_btree_leaf_t *)right_child_hdr;
			right_child_first = right_child->btl_elems +
			    right_child_hdr->bth_first * size;
		}
		comp = tree->bt_compar(node->btc_elems + i * size,
		    right_child_first);
		if (comp >= 0) {
			panic("btree: compar returned %d (expected -1) at "
			    "%px %d: compar(%px, %px)", comp, node, i,
			    node->btc_elems + i * size, right_child_first);
		}
	}
	/* Recurse into every child subtree. */
	for (uint32_t i = 0; i <= hdr->bth_count; i++)
		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
}
2145
2146 /* Check that all elements in the tree are in sorted order. */
2147 static void
zfs_btree_verify_order(zfs_btree_t * tree)2148 zfs_btree_verify_order(zfs_btree_t *tree)
2149 {
2150 EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
2151 if (tree->bt_height == -1) {
2152 return;
2153 }
2154
2155 zfs_btree_verify_order_helper(tree, tree->bt_root);
2156 }
2157
2158 #ifdef ZFS_DEBUG
/*
 * Check that all unused memory in this node is poisoned correctly.
 * Unused element bytes are expected to hold the 0x0f poison byte, and
 * unused child pointers the BTREE_POISON pattern.
 */
static void
zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(hdr)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
		/* Bytes before the first valid element must be poisoned. */
		for (size_t i = 0; i < hdr->bth_first * size; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
		/* esize: total bytes available for elements in a leaf. */
		size_t esize = tree->bt_leaf_size -
		    offsetof(zfs_btree_leaf_t, btl_elems);
		/* Bytes after the last valid element must be poisoned. */
		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
		    i < esize; i++)
			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
	} else {
		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
		/* Unused separator slots must be filled with poison bytes. */
		for (size_t i = hdr->bth_count * size;
		    i < BTREE_CORE_ELEMS * size; i++)
			VERIFY3U(node->btc_elems[i], ==, 0x0f);

		/* Unused child pointers must hold the poison pattern. */
		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
		    i++) {
			VERIFY3P(node->btc_children[i], ==,
			    (zfs_btree_hdr_t *)BTREE_POISON);
		}

		/* Recurse into the live children. */
		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
			zfs_btree_verify_poison_helper(tree,
			    node->btc_children[i]);
		}
	}
}
2191 #endif
2192
2193 /* Check that unused memory in the tree is still poisoned. */
2194 static void
zfs_btree_verify_poison(zfs_btree_t * tree)2195 zfs_btree_verify_poison(zfs_btree_t *tree)
2196 {
2197 #ifdef ZFS_DEBUG
2198 if (tree->bt_height == -1)
2199 return;
2200 zfs_btree_verify_poison_helper(tree, tree->bt_root);
2201 #else
2202 (void) tree;
2203 #endif
2204 }
2205
2206 void
zfs_btree_verify(zfs_btree_t * tree)2207 zfs_btree_verify(zfs_btree_t *tree)
2208 {
2209 if (zfs_btree_verify_intensity == 0)
2210 return;
2211 zfs_btree_verify_height(tree);
2212 if (zfs_btree_verify_intensity == 1)
2213 return;
2214 zfs_btree_verify_pointers(tree);
2215 if (zfs_btree_verify_intensity == 2)
2216 return;
2217 zfs_btree_verify_counts(tree);
2218 if (zfs_btree_verify_intensity == 3)
2219 return;
2220 zfs_btree_verify_order(tree);
2221
2222 if (zfs_btree_verify_intensity == 4)
2223 return;
2224 zfs_btree_verify_poison(tree);
2225 }
2226
/*
 * Expose zfs_btree_verify_intensity as a runtime-writable module parameter;
 * see the intensity-level descriptions at the top of this file.
 */
ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
	"Enable btree verification. Levels above 4 require ZFS be built "
	"with debugging");
2230