// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>
#include <sys/sysmacros.h>

#ifndef _KERNEL
/*
 * Need the extra 'abort()' here since it is possible for PANIC() to
 * return, and our panic() usage in this file assumes it's NORETURN.
 */
#define	panic(...) do { PANIC(__VA_ARGS__); abort(); } while (0)
#define	zfs_panic_recover(...) panic(__VA_ARGS__)
#endif

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation to the range tree via zfs_range_tree_add does one
 * of three things:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove does one
 * of three things:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_create_gap(). The default
 * behavior is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more detailed information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * Gap-bridging range trees are used by the dsl scanning code to group
 * I/Os into mostly sequential chunks to optimize disk performance. The
 * code here attempts to do this with as little memory and computational
 * overhead as possible. One limitation of this implementation is that
 * range trees with gaps only support removing complete segments.
 */
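
/*
 * Example usage (an illustrative sketch, not compiled into this file;
 * dump_seg() is a hypothetical callback and the offsets are made up):
 *
 *	static void
 *	dump_seg(void *arg, uint64_t start, uint64_t size)
 *	{
 *		(void) arg;
 *		zfs_dbgmsg("seg [%llx, %llx)", (u_longlong_t)start,
 *		    (u_longlong_t)(start + size));
 *	}
 *
 *	zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 *	    NULL, 0, 0);
 *	zfs_range_tree_add(rt, 0x1000, 0x1000);    adds [0x1000, 0x2000)
 *	zfs_range_tree_add(rt, 0x2000, 0x1000);    merges to [0x1000, 0x3000)
 *	zfs_range_tree_remove(rt, 0x1800, 0x400);  punches a hole: two segments
 *	zfs_range_tree_walk(rt, dump_seg, NULL);   visits both segments
 *	zfs_range_tree_vacate(rt, NULL, NULL);     empty the tree, then...
 *	zfs_range_tree_destroy(rt);                ...free it
 */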

static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
	ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size = 0;
	switch (rt->rt_type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		break;
	default:
		__builtin_unreachable();
	}
	memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs;
	zfs_btree_index_t where;
	uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		uint64_t size = zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt);
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
		VERIFY3UF(hist[i], ==, rt->rt_histogram[i],
		    "i=%d, hist=%px, hist[i]=%llu, rt_hist[i]=%llu",
		    i, hist, (u_longlong_t)hist[i],
		    (u_longlong_t)rt->rt_histogram[i]);
	}
}

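/*
 * Histogram bookkeeping: a segment of 'size' bytes is counted in bucket
 * highbit64(size) - 1, i.e. floor(log2(size)). For example, a 0x1000-byte
 * (4 KiB) segment has highbit64() == 13 and lands in bucket 12, which
 * covers sizes [0x1000, 0x2000).
 */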
static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
	uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

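/*
 * Note on the comparators below: the expression
 *
 *	(r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)
 *
 * is a branchless three-way comparison. It evaluates to 1 if r1 lies
 * entirely at or after r2, to -1 if r1 lies entirely at or before r2,
 * and to 0 if the segments overlap. Overlapping segments thus compare
 * as "equal", which is what lets zfs_btree_find() locate any segment
 * intersecting a search range. For example, r1 = [5, 10) vs.
 * r2 = [8, 12) gives 0 - 0 = 0, i.e. a match.
 */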
__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
	const zfs_range_seg32_t *r1 = x1;
	const zfs_range_seg32_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
	const zfs_range_seg64_t *r1 = x1;
	const zfs_range_seg64_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
	const zfs_range_seg_gap_t *r1 = x1;
	const zfs_range_seg_gap_t *r2 = x2;

	ASSERT3U(r1->rs_start, <=, r1->rs_end);
	ASSERT3U(r2->rs_start, <=, r2->rs_end);

	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

static zfs_range_tree_t *
zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap, uint64_t flags, const char *name)
{
	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

	ASSERT3U(shift, <, 64);
	ASSERT3U(type, <, ZFS_RANGE_SEG_NUM_TYPES);
	size_t size;
	int (*compare) (const void *, const void *);
	bt_find_in_buf_f bt_find;
	switch (type) {
	case ZFS_RANGE_SEG32:
		size = sizeof (zfs_range_seg32_t);
		compare = zfs_range_tree_seg32_compare;
		bt_find = zfs_range_tree_seg32_find_in_buf;
		break;
	case ZFS_RANGE_SEG64:
		size = sizeof (zfs_range_seg64_t);
		compare = zfs_range_tree_seg64_compare;
		bt_find = zfs_range_tree_seg64_find_in_buf;
		break;
	case ZFS_RANGE_SEG_GAP:
		size = sizeof (zfs_range_seg_gap_t);
		compare = zfs_range_tree_seg_gap_compare;
		bt_find = zfs_range_tree_seg_gap_find_in_buf;
		break;
	default:
		panic("Invalid range seg type %d", type);
	}
	zfs_btree_create(&rt->rt_root, compare, bt_find, size);

	rt->rt_ops = ops;
	rt->rt_gap = gap;
	rt->rt_flags = flags;
	rt->rt_name = name;
	rt->rt_arg = arg;
	rt->rt_type = type;
	rt->rt_start = start;
	rt->rt_shift = shift;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
	    0, NULL));
}
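
/*
 * For example (an illustrative sketch with made-up offsets): in a tree
 * created with gap = 0x10, adding [0x0, 0x10) and then [0x18, 0x28)
 * produces a single segment [0x0, 0x28) with a fill of 0x20: the
 * 0x8-byte hole is bridged into the segment but does not count toward
 * its fill.
 */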

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
	    0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t flags, const char *name)
{
	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
	    flags, name));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
		kmem_strfree((char *)(uintptr_t)rt->rt_name);

	zfs_btree_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
	if (delta < 0 && -delta >= zfs_rs_get_fill(rs, rt)) {
		zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
		    "or below 0; probable double remove in segment [%llx:%llx]",
		    ZFS_RT_NAME(rt),
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}
	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt)) {
		zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
		    "beyond max; probable double add in segment [%llx:%llx]",
		    ZFS_RT_NAME(rt),
		    (longlong_t)zfs_rs_get_start(rs, rt),
		    (longlong_t)zfs_rs_get_end(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
	zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	zfs_range_tree_t *rt = arg;
	zfs_btree_index_t where;
	zfs_range_seg_t *rs_before, *rs_after, *rs;
	zfs_range_seg_max_t tmp, rsearch;
	uint64_t end = start + size, gap = rt->rt_gap;
	uint64_t bridge_size = 0;
	boolean_t merge_before, merge_after;

	ASSERT3U(size, !=, 0);
	ASSERT3U(fill, <=, size);
	ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/*
	 * If this is a gap-supporting range tree, it is possible that we
	 * are inserting into an existing segment. In this case simply
	 * bump the fill count and call the remove / add callbacks. If the
	 * new range will extend an existing segment, we remove the
	 * existing one, apply the new extent to it and re-insert it using
	 * the normal code paths.
	 */
	if (rs != NULL) {
		uint64_t rstart = zfs_rs_get_start(rs, rt);
		uint64_t rend = zfs_rs_get_end(rs, rt);
		if (gap == 0) {
			zfs_panic_recover("zfs: rt=%s: adding segment "
			    "(offset=%llx size=%llx) overlapping with existing "
			    "one (offset=%llx size=%llx)",
			    ZFS_RT_NAME(rt),
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rstart, (longlong_t)(rend - rstart));
			return;
		}
		if (rstart <= start && rend >= end) {
			zfs_range_tree_adjust_fill(rt, rs, fill);
			return;
		}

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs);
		rt->rt_space -= rend - rstart;

		fill += zfs_rs_get_fill(rs, rt);
		start = MIN(start, rstart);
		end = MAX(end, rend);
		size = end - start;

		zfs_btree_remove(&rt->rt_root, rs);
		zfs_range_tree_add_impl(rt, start, size, fill);
		return;
	}

	ASSERT0P(rs);

	/*
	 * Determine whether or not we will have to merge with our neighbors.
	 * If gap != 0, we might need to merge with our neighbors even if we
	 * aren't directly touching.
	 */
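	/*
	 * Schematically (offsets illustrative only):
	 *
	 *	 rs_before          new           rs_after
	 *	|----------| gap |-------| gap |------------|
	 *
	 * merge_before fires when rs_before ends within 'gap' of 'start';
	 * merge_after fires when rs_after starts within 'gap' of 'end'. If
	 * both fire, the three ranges collapse into a single segment running
	 * from rs_before's start to rs_after's end.
	 */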
	zfs_btree_index_t where_before, where_after;
	rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
	rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

	merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
	    start - gap);
	merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
	    end + gap);

	if (merge_before && gap != 0)
		bridge_size += start - zfs_rs_get_end(rs_before, rt);
	if (merge_after && gap != 0)
		bridge_size += zfs_rs_get_start(rs_after, rt) - end;

	if (merge_before && merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		zfs_range_tree_stat_decr(rt, rs_before);
		zfs_range_tree_stat_decr(rt, rs_after);

		zfs_rs_copy(rs_after, &tmp, rt);
		uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_btree_remove_idx(&rt->rt_root, &where_before);

		/*
		 * We have to re-find the node because our old reference is
		 * invalid as soon as we do any mutating btree operations.
		 */
		rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
		ASSERT3P(rs_after, !=, NULL);
		zfs_rs_set_start_raw(rs_after, rt, before_start);
		zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_before);

		uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
		zfs_rs_set_end(rs_before, rt, end);
		zfs_rs_set_fill(rs_before, rt, before_fill + fill);
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		zfs_range_tree_stat_decr(rt, rs_after);

		uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
		zfs_rs_set_start(rs_after, rt, start);
		zfs_rs_set_fill(rs_after, rt, after_fill + fill);
		rs = rs_after;
	} else {
		rs = &tmp;

		zfs_rs_set_start(rs, rt, start);
		zfs_rs_set_end(rs, rt, end);
		zfs_rs_set_fill(rs, rt, fill);
		zfs_btree_add_idx(&rt->rt_root, rs, &where);
	}

	if (gap != 0) {
		ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	} else {
		ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	zfs_range_tree_stat_incr(rt, rs);
	rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_add_impl(arg, start, size, size);
}

static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
	zfs_btree_index_t where;
	zfs_range_seg_t *rs;
	zfs_range_seg_max_t rsearch, rs_tmp;
	uint64_t end = start + size;
	uint64_t rstart, rend;
	boolean_t left_over, right_over;

	VERIFY3U(size, !=, 0);
	VERIFY3U(size, <=, rt->rt_space);
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

	/* Make sure we completely overlap with someone */
	if (rs == NULL) {
		zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
		    "from range tree (offset=%llx size=%llx)",
		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
		return;
	}

	rstart = zfs_rs_get_start(rs, rt);
	rend = zfs_rs_get_end(rs, rt);

	/*
	 * Range trees with gap support must only remove complete segments
	 * from the tree. This allows us to maintain accurate fill accounting
	 * and to ensure that bridged sections are not leaked. If we need to
	 * remove less than the full segment, we can only adjust the fill count.
	 */
	if (rt->rt_gap != 0) {
		if (do_fill) {
			if (zfs_rs_get_fill(rs, rt) == size) {
				start = rstart;
				end = rend;
				size = end - start;
			} else {
				zfs_range_tree_adjust_fill(rt, rs, -size);
				return;
			}
		} else if (rstart != start || rend != end) {
			zfs_panic_recover("zfs: rt=%s: freeing partial segment "
			    "of gap tree (offset=%llx size=%llx) of "
			    "(offset=%llx size=%llx)",
			    ZFS_RT_NAME(rt),
			    (longlong_t)start, (longlong_t)size,
			    (longlong_t)rstart, (longlong_t)(rend - rstart));
			return;
		}
	}

	if (!(rstart <= start && rend >= end)) {
		zfs_panic_recover("zfs: rt=%s: removing segment "
		    "(offset=%llx size=%llx) not completely overlapped by "
		    "existing one (offset=%llx size=%llx)",
		    ZFS_RT_NAME(rt),
		    (longlong_t)start, (longlong_t)size,
		    (longlong_t)rstart, (longlong_t)(rend - rstart));
		return;
	}

	left_over = (rstart != start);
	right_over = (rend != end);

	zfs_range_tree_stat_decr(rt, rs);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	if (left_over && right_over) {
		zfs_range_seg_max_t newseg;
		zfs_rs_set_start(&newseg, rt, end);
		zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
		zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
		zfs_range_tree_stat_incr(rt, &newseg);

		/* This modifies the buffer already inside the range tree */
		zfs_rs_set_end(rs, rt, start);

		zfs_rs_copy(rs, &rs_tmp, rt);
		if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
			zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
		else
			zfs_btree_add(&rt->rt_root, &newseg);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
	} else if (left_over) {
		/* This modifies the buffer already inside the range tree */
		zfs_rs_set_end(rs, rt, start);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else if (right_over) {
		/* This modifies the buffer already inside the range tree */
		zfs_rs_set_start(rs, rt, end);
		zfs_rs_copy(rs, &rs_tmp, rt);
	} else {
		zfs_btree_remove_idx(&rt->rt_root, &where);
		rs = NULL;
	}

	if (rs != NULL) {
		/*
		 * The fill of the leftover segment will always be equal to
		 * the size, since we do not support removing partial segments
		 * of range trees with gaps.
		 */
		zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
		    zfs_rs_get_start_raw(rs, rt));
		zfs_range_tree_stat_incr(rt, &rs_tmp);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
			rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
	}

	rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}

void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
	int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
	    zfs_rs_get_start(rs, rt));

	zfs_range_tree_stat_decr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

	zfs_rs_set_start(rs, rt, newstart);
	zfs_rs_set_end(rs, rt, newstart + newsize);

	zfs_range_tree_stat_incr(rt, rs);
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_max_t rsearch;
	uint64_t end = start + size;

	VERIFY(size != 0);

	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end(&rsearch, rt, end);
	return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
	if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
	    zfs_rs_get_end(rs, rt) >= start + size) {
		return (rs);
	}
	return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
	zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Finds the first subset of the given range which overlaps with the range
 * tree and stores it in *ostart and *osize. Returns B_TRUE if there is
 * such a segment in the range, and B_FALSE if there isn't.
 */
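/*
 * For example (illustrative offsets): if the tree contains [0x20, 0x30)
 * and the caller passes start = 0x10, size = 0x28 (i.e. the range
 * [0x10, 0x38)), the function sets *ostart = 0x20 and *osize = 0x10,
 * clamping the overlap to the segment on both sides, and returns B_TRUE.
 */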
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	zfs_range_seg_max_t rsearch;
	zfs_rs_set_start(&rsearch, rt, start);
	zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
	    1);

	zfs_btree_index_t where;
	zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
	if (rs != NULL) {
		*ostart = start;
		*osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
		return (B_TRUE);
	}

	rs = zfs_btree_next(&rt->rt_root, &where, &where);
	if (rs == NULL || zfs_rs_get_start(rs, rt) >= start + size)
		return (B_FALSE);

	*ostart = zfs_rs_get_start(rs, rt);
	*osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
	    zfs_rs_get_start(rs, rt);
	return (B_TRUE);
}

/*
 * Ensure that the given range is not in the tree when the call returns;
 * it is fine if only part of the range, or none of it, was present
 * beforehand.
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
	zfs_range_seg_t *rs;

	if (size == 0)
		return;

	if (rt->rt_type == ZFS_RANGE_SEG64)
		ASSERT3U(start + size, >, start);

	while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
		uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
		zfs_range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
	zfs_range_tree_t *rt;

	ASSERT0(zfs_range_tree_space(*rtdst));
	ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	if (func != NULL) {
		zfs_range_seg_t *rs;
		zfs_btree_index_t *cookie = NULL;

		while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
		    NULL) {
			func(arg, zfs_rs_get_start(rs, rt),
			    zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
		}
	} else {
		zfs_btree_clear(&rt->rt_root);
	}

	memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
	    rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
		    zfs_rs_get_start(rs, rt));
	}
}

zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
	return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
	return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
	return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
	ASSERT(rt != NULL);
	return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove from removefrom any portions of the segment [start, end) that
 * overlap it, and add the non-overlapping leftovers to addto.
 */
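/*
 * For example (illustrative offsets): if removefrom contains [0x0, 0x40)
 * and the segment is [0x20, 0x60), then after the call removefrom is left
 * with [0x0, 0x20) and addto gains [0x40, 0x60).
 */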
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	zfs_range_seg_max_t starting_rs;
	zfs_rs_set_start(&starting_rs, removefrom, start);
	zfs_rs_set_end_raw(&starting_rs, removefrom,
	    zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

	zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
	    &starting_rs, &where);

	if (curr == NULL)
		curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

	zfs_range_seg_t *next;
	for (; curr != NULL; curr = next) {
		if (start == end)
			return;
		VERIFY3U(start, <, end);

		/* there is no overlap */
		if (end <= zfs_rs_get_start(curr, removefrom)) {
			zfs_range_tree_add(addto, start, end - start);
			return;
		}

		uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
		    start);
		uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
		    end);
		uint64_t overlap_size = overlap_end - overlap_start;
		ASSERT3S(overlap_size, >, 0);
		zfs_range_seg_max_t rs;
		zfs_rs_copy(curr, &rs, removefrom);

		zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

		if (start < overlap_start)
			zfs_range_tree_add(addto, start, overlap_start - start);

		start = overlap_end;
		next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
		/*
		 * If we find something here, we only removed part of the
		 * curr segment. Either there's some left at the end
		 * because we've reached the end of the range we're removing,
		 * or there's some left at the start because we started
		 * partway through the range.  Either way, we continue with
		 * the loop. If it's the former, we'll return at the start of
		 * the loop, and if it's the latter we'll see if there is more
		 * area to process.
		 */
		if (next != NULL) {
			ASSERT(start == end || start == zfs_rs_get_end(&rs,
			    removefrom));
		}

		next = zfs_btree_next(&removefrom->rt_root, &where, &where);
	}
	VERIFY0P(curr);

	if (start != end) {
		VERIFY3U(start, <, end);
		zfs_range_tree_add(addto, start, end - start);
	} else {
		VERIFY3U(start, ==, end);
	}
}

/*
 * For each entry in rt, if it exists in removefrom, remove it
 * from removefrom. Otherwise, add it to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
	zfs_btree_index_t where;
	for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
	    rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
		zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
		    zfs_rs_get_end(rs, rt), removefrom, addto);
	}
}

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
	zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
	return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
	return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}
923