// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>
#include <sys/sysmacros.h>
#ifndef _KERNEL
/*
 * Need the extra abort() here since it is possible for PANIC() to return,
 * and our panic() usage in this file assumes it is NORETURN.
 */
#define panic(...) do { PANIC(__VA_ARGS__); abort(); } while (0)
#define zfs_panic_recover(...) panic(__VA_ARGS__)
#endif

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via zfs_range_tree_add() can:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via zfs_range_tree_remove() can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See zfs_range_tree_create_gap(). The default
 * behavior is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the zfs_range_tree_walk()
 * or zfs_range_tree_vacate() functions.
 *
 * To obtain more accurate information on individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a zfs_range_tree_ops_t structure
 * to the zfs_range_tree_create function. Any callbacks that are non-NULL
 * are then called at the appropriate times.
 *
 * The range tree code also supports a special variant of range trees
 * that can bridge small gaps between segments. This kind of tree is used
 * by the dsl scanning code to group I/Os into mostly sequential chunks to
 * optimize disk performance. The code here attempts to do this with as
 * little memory and computational overhead as possible. One limitation of
 * this implementation is that range trees with gaps only support the
 * removal of complete segments.
 */

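/*
 * Illustrative, compiled-out sketch (a hypothetical helper, not used
 * anywhere): a walk through the add/remove behavior described above, using
 * only functions defined in this file. The offsets are arbitrary examples.
 */
#if 0
static void
zfs_range_tree_example(void)
{
        /* A 64-bit tree with no callbacks, no offset bias, no shift. */
        zfs_range_tree_t *rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
            NULL, 0, 0);

        zfs_range_tree_add(rt, 0x1000, 0x1000); /* creates [0x1000, 0x2000) */
        zfs_range_tree_add(rt, 0x2000, 0x1000); /* extends to [0x1000, 0x3000) */
        /* Punches a hole, splitting into [0x1000, 0x1800) and [0x2000, 0x3000). */
        zfs_range_tree_remove(rt, 0x1800, 0x800);

        ASSERT3U(zfs_range_tree_numsegs(rt), ==, 2);
        ASSERT3U(zfs_range_tree_space(rt), ==, 0x1800);

        /* zfs_range_tree_destroy() requires rt_space == 0, so vacate first. */
        zfs_range_tree_vacate(rt, NULL, NULL);
        zfs_range_tree_destroy(rt);
}
#endif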
static inline void
zfs_rs_copy(zfs_range_seg_t *src, zfs_range_seg_t *dest, zfs_range_tree_t *rt)
{
        ASSERT3U(rt->rt_type, <, ZFS_RANGE_SEG_NUM_TYPES);
        size_t size = 0;
        switch (rt->rt_type) {
        case ZFS_RANGE_SEG32:
                size = sizeof (zfs_range_seg32_t);
                break;
        case ZFS_RANGE_SEG64:
                size = sizeof (zfs_range_seg64_t);
                break;
        case ZFS_RANGE_SEG_GAP:
                size = sizeof (zfs_range_seg_gap_t);
                break;
        default:
                __builtin_unreachable();
        }
        memcpy(dest, src, size);
}

void
zfs_range_tree_stat_verify(zfs_range_tree_t *rt)
{
        zfs_range_seg_t *rs;
        zfs_btree_index_t where;
        uint64_t hist[ZFS_RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
        int i;

        for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
            rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
                uint64_t size = zfs_rs_get_end(rs, rt) -
                    zfs_rs_get_start(rs, rt);
                int idx = highbit64(size) - 1;

                hist[idx]++;
                ASSERT3U(hist[idx], !=, 0);
        }

        for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
                VERIFY3UF(hist[i], ==, rt->rt_histogram[i],
                    "i=%d, hist=%px, hist=%llu, rt_hist=%llu",
                    i, hist, (u_longlong_t)hist[i],
                    (u_longlong_t)rt->rt_histogram[i]);
        }
}

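/*
 * Segment-size histogram: a segment of size s is counted in bucket
 * highbit64(s) - 1, so bucket idx holds segments with sizes in
 * [2^idx, 2^(idx+1)). For example, a 4 KiB segment (s = 0x1000 = 2^12)
 * has highbit64(s) == 13 and is counted in bucket 12.
 */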
static void
zfs_range_tree_stat_incr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
        uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
        int idx = highbit64(size) - 1;

        ASSERT(size != 0);
        ASSERT3U(idx, <,
            sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

        rt->rt_histogram[idx]++;
        ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
zfs_range_tree_stat_decr(zfs_range_tree_t *rt, zfs_range_seg_t *rs)
{
        uint64_t size = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
        int idx = highbit64(size) - 1;

        ASSERT(size != 0);
        ASSERT3U(idx, <,
            sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

        ASSERT3U(rt->rt_histogram[idx], !=, 0);
        rt->rt_histogram[idx]--;
}

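/*
 * The segment comparators below order segments by offset and treat any two
 * overlapping segments as equal: the expression
 * (r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start) evaluates to
 * 1 when r1 lies entirely after r2, -1 when it lies entirely before, and 0
 * when the two overlap. This is what lets zfs_btree_find() locate a segment
 * that overlaps a search range. Because rs_end is exclusive, merely
 * adjacent segments do not compare as equal.
 */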
__attribute__((always_inline)) inline
static int
zfs_range_tree_seg32_compare(const void *x1, const void *x2)
{
        const zfs_range_seg32_t *r1 = x1;
        const zfs_range_seg32_t *r2 = x2;

        ASSERT3U(r1->rs_start, <=, r1->rs_end);
        ASSERT3U(r2->rs_start, <=, r2->rs_end);

        return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg64_compare(const void *x1, const void *x2)
{
        const zfs_range_seg64_t *r1 = x1;
        const zfs_range_seg64_t *r2 = x2;

        ASSERT3U(r1->rs_start, <=, r1->rs_end);
        ASSERT3U(r2->rs_start, <=, r2->rs_end);

        return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

__attribute__((always_inline)) inline
static int
zfs_range_tree_seg_gap_compare(const void *x1, const void *x2)
{
        const zfs_range_seg_gap_t *r1 = x1;
        const zfs_range_seg_gap_t *r2 = x2;

        ASSERT3U(r1->rs_start, <=, r1->rs_end);
        ASSERT3U(r2->rs_start, <=, r2->rs_end);

        return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg32_find_in_buf, zfs_range_seg32_t,
    zfs_range_tree_seg32_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
    zfs_range_tree_seg64_compare)

ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
    zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

static zfs_range_tree_t *
zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap, uint64_t flags, const char *name)
{
        zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);

        ASSERT3U(shift, <, 64);
        ASSERT3U(type, <=, ZFS_RANGE_SEG_NUM_TYPES);
        size_t size;
        int (*compare) (const void *, const void *);
        bt_find_in_buf_f bt_find;
        switch (type) {
        case ZFS_RANGE_SEG32:
                size = sizeof (zfs_range_seg32_t);
                compare = zfs_range_tree_seg32_compare;
                bt_find = zfs_range_tree_seg32_find_in_buf;
                break;
        case ZFS_RANGE_SEG64:
                size = sizeof (zfs_range_seg64_t);
                compare = zfs_range_tree_seg64_compare;
                bt_find = zfs_range_tree_seg64_find_in_buf;
                break;
        case ZFS_RANGE_SEG_GAP:
                size = sizeof (zfs_range_seg_gap_t);
                compare = zfs_range_tree_seg_gap_compare;
                bt_find = zfs_range_tree_seg_gap_find_in_buf;
                break;
        default:
                panic("Invalid range seg type %d", type);
        }
        zfs_btree_create(&rt->rt_root, compare, bt_find, size);

        rt->rt_ops = ops;
        rt->rt_gap = gap;
        rt->rt_flags = flags;
        rt->rt_name = name;
        rt->rt_arg = arg;
        rt->rt_type = type;
        rt->rt_start = start;
        rt->rt_shift = shift;

        if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
                rt->rt_ops->rtop_create(rt, rt->rt_arg);

        return (rt);
}

zfs_range_tree_t *
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t gap)
{
        return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
            0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
        return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
            0, NULL));
}

zfs_range_tree_t *
zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
    uint64_t flags, const char *name)
{
        return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
            flags, name));
}

void
zfs_range_tree_destroy(zfs_range_tree_t *rt)
{
        VERIFY0(rt->rt_space);

        if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
                rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

        if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
                kmem_strfree((char *)(uintptr_t)rt->rt_name);

        zfs_btree_destroy(&rt->rt_root);
        kmem_free(rt, sizeof (*rt));
}

void
zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
{
        if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
                zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
                    "or below 0; probable double remove in segment [%llx:%llx]",
                    ZFS_RT_NAME(rt),
                    (longlong_t)zfs_rs_get_start(rs, rt),
                    (longlong_t)zfs_rs_get_end(rs, rt));
        }
        if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
            zfs_rs_get_start(rs, rt)) {
                zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
                    "beyond max; probable double add in segment [%llx:%llx]",
                    ZFS_RT_NAME(rt),
                    (longlong_t)zfs_rs_get_start(rs, rt),
                    (longlong_t)zfs_rs_get_end(rs, rt));
        }

        if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
        zfs_rs_set_fill(rs, rt, zfs_rs_get_fill(rs, rt) + delta);
        if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}

static void
zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
        zfs_range_tree_t *rt = arg;
        zfs_btree_index_t where;
        zfs_range_seg_t *rs_before, *rs_after, *rs;
        zfs_range_seg_max_t tmp, rsearch;
        uint64_t end = start + size, gap = rt->rt_gap;
        uint64_t bridge_size = 0;
        boolean_t merge_before, merge_after;

        ASSERT3U(size, !=, 0);
        ASSERT3U(fill, <=, size);
        ASSERT3U(start + size, >, start);

        zfs_rs_set_start(&rsearch, rt, start);
        zfs_rs_set_end(&rsearch, rt, end);
        rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

        /*
         * If this is a gap-supporting range tree, it is possible that we
         * are inserting into an existing segment. In this case simply
         * bump the fill count and call the remove / add callbacks. If the
         * new range will extend an existing segment, we remove the
         * existing one, apply the new extent to it and re-insert it using
         * the normal code paths.
         */
        if (rs != NULL) {
                uint64_t rstart = zfs_rs_get_start(rs, rt);
                uint64_t rend = zfs_rs_get_end(rs, rt);
                if (gap == 0) {
                        zfs_panic_recover("zfs: rt=%s: adding segment "
                            "(offset=%llx size=%llx) overlapping with existing "
                            "one (offset=%llx size=%llx)",
                            ZFS_RT_NAME(rt),
                            (longlong_t)start, (longlong_t)size,
                            (longlong_t)rstart, (longlong_t)(rend - rstart));
                        return;
                }
                if (rstart <= start && rend >= end) {
                        zfs_range_tree_adjust_fill(rt, rs, fill);
                        return;
                }

                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                        rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

                zfs_range_tree_stat_decr(rt, rs);
                rt->rt_space -= rend - rstart;

                fill += zfs_rs_get_fill(rs, rt);
                start = MIN(start, rstart);
                end = MAX(end, rend);
                size = end - start;

                zfs_btree_remove(&rt->rt_root, rs);
                zfs_range_tree_add_impl(rt, start, size, fill);
                return;
        }

        ASSERT0P(rs);

        /*
         * Determine whether or not we will have to merge with our neighbors.
         * If gap != 0, we might need to merge with our neighbors even if we
         * aren't directly touching.
         */
        zfs_btree_index_t where_before, where_after;
        rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
        rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);

        merge_before = (rs_before != NULL && zfs_rs_get_end(rs_before, rt) >=
            start - gap);
        merge_after = (rs_after != NULL && zfs_rs_get_start(rs_after, rt) <=
            end + gap);

        if (merge_before && gap != 0)
                bridge_size += start - zfs_rs_get_end(rs_before, rt);
        if (merge_after && gap != 0)
                bridge_size += zfs_rs_get_start(rs_after, rt) - end;

        if (merge_before && merge_after) {
                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
                        rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
                        rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
                }

                zfs_range_tree_stat_decr(rt, rs_before);
                zfs_range_tree_stat_decr(rt, rs_after);

                zfs_rs_copy(rs_after, &tmp, rt);
                uint64_t before_start = zfs_rs_get_start_raw(rs_before, rt);
                uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
                uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
                zfs_btree_remove_idx(&rt->rt_root, &where_before);

                /*
                 * We have to re-find the node because our old reference is
                 * invalid as soon as we do any mutating btree operations.
                 */
                rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
                ASSERT3P(rs_after, !=, NULL);
                zfs_rs_set_start_raw(rs_after, rt, before_start);
                zfs_rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
                rs = rs_after;
        } else if (merge_before) {
                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                        rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

                zfs_range_tree_stat_decr(rt, rs_before);

                uint64_t before_fill = zfs_rs_get_fill(rs_before, rt);
                zfs_rs_set_end(rs_before, rt, end);
                zfs_rs_set_fill(rs_before, rt, before_fill + fill);
                rs = rs_before;
        } else if (merge_after) {
                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                        rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

                zfs_range_tree_stat_decr(rt, rs_after);

                uint64_t after_fill = zfs_rs_get_fill(rs_after, rt);
                zfs_rs_set_start(rs_after, rt, start);
                zfs_rs_set_fill(rs_after, rt, after_fill + fill);
                rs = rs_after;
        } else {
                rs = &tmp;

                zfs_rs_set_start(rs, rt, start);
                zfs_rs_set_end(rs, rt, end);
                zfs_rs_set_fill(rs, rt, fill);
                zfs_btree_add_idx(&rt->rt_root, rs, &where);
        }

        if (gap != 0) {
                ASSERT3U(zfs_rs_get_fill(rs, rt), <=, zfs_rs_get_end(rs, rt) -
                    zfs_rs_get_start(rs, rt));
        } else {
                ASSERT3U(zfs_rs_get_fill(rs, rt), ==, zfs_rs_get_end(rs, rt) -
                    zfs_rs_get_start(rs, rt));
        }

        if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

        zfs_range_tree_stat_incr(rt, rs);
        rt->rt_space += size + bridge_size;
}

void
zfs_range_tree_add(void *arg, uint64_t start, uint64_t size)
{
        zfs_range_tree_add_impl(arg, start, size, size);
}

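/*
 * Illustrative, compiled-out sketch (a hypothetical helper, not used
 * anywhere): gap bridging with zfs_range_tree_create_gap(). With gap =
 * 0x100, two segments at most 0x100 apart are merged into one, and the
 * bridged bytes are counted in rt_space but not in the segment's fill.
 */
#if 0
static void
zfs_range_tree_gap_example(void)
{
        /* Gap trees need ZFS_RANGE_SEG_GAP segments, which carry a fill. */
        zfs_range_tree_t *rt = zfs_range_tree_create_gap(NULL,
            ZFS_RANGE_SEG_GAP, NULL, 0, 0, 0x100);

        zfs_range_tree_add(rt, 0x1000, 0x100); /* [0x1000, 0x1100), fill 0x100 */
        /*
         * [0x1180, 0x1200) starts 0x80 past the previous end, within the
         * 0x100 gap, so it merges into a single segment [0x1000, 0x1200)
         * with fill 0x180; the 0x80 bridge is included in rt_space.
         */
        zfs_range_tree_add(rt, 0x1180, 0x80);
        ASSERT3U(zfs_range_tree_numsegs(rt), ==, 1);
        ASSERT3U(zfs_range_tree_space(rt), ==, 0x200);

        zfs_range_tree_vacate(rt, NULL, NULL);
        zfs_range_tree_destroy(rt);
}
#endif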
static void
zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    boolean_t do_fill)
{
        zfs_btree_index_t where;
        zfs_range_seg_t *rs;
        zfs_range_seg_max_t rsearch, rs_tmp;
        uint64_t end = start + size;
        uint64_t rstart, rend;
        boolean_t left_over, right_over;

        VERIFY3U(size, !=, 0);
        VERIFY3U(size, <=, rt->rt_space);
        if (rt->rt_type == ZFS_RANGE_SEG64)
                ASSERT3U(start + size, >, start);

        zfs_rs_set_start(&rsearch, rt, start);
        zfs_rs_set_end(&rsearch, rt, end);
        rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);

        /* Make sure we completely overlap with someone */
        if (rs == NULL) {
                zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
                    "from range tree (offset=%llx size=%llx)",
                    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
                return;
        }

        rstart = zfs_rs_get_start(rs, rt);
        rend = zfs_rs_get_end(rs, rt);

        /*
         * Range trees with gap support must only remove complete segments
         * from the tree. This allows us to maintain accurate fill accounting
         * and to ensure that bridged sections are not leaked. If we need to
         * remove less than the full segment, we can only adjust the fill count.
         */
        if (rt->rt_gap != 0) {
                if (do_fill) {
                        if (zfs_rs_get_fill(rs, rt) == size) {
                                start = rstart;
                                end = rend;
                                size = end - start;
                        } else {
                                zfs_range_tree_adjust_fill(rt, rs, -size);
                                return;
                        }
                } else if (rstart != start || rend != end) {
                        zfs_panic_recover("zfs: rt=%s: freeing partial segment "
                            "of gap tree (offset=%llx size=%llx) of "
                            "(offset=%llx size=%llx)",
                            ZFS_RT_NAME(rt),
                            (longlong_t)start, (longlong_t)size,
                            (longlong_t)rstart, (longlong_t)(rend - rstart));
                        return;
                }
        }

        if (!(rstart <= start && rend >= end)) {
                zfs_panic_recover("zfs: rt=%s: removing segment "
                    "(offset=%llx size=%llx) not completely overlapped by "
                    "existing one (offset=%llx size=%llx)",
                    ZFS_RT_NAME(rt),
                    (longlong_t)start, (longlong_t)size,
                    (longlong_t)rstart, (longlong_t)(rend - rstart));
                return;
        }

        left_over = (rstart != start);
        right_over = (rend != end);

        zfs_range_tree_stat_decr(rt, rs);

        if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

        if (left_over && right_over) {
                zfs_range_seg_max_t newseg;
                zfs_rs_set_start(&newseg, rt, end);
                zfs_rs_set_end_raw(&newseg, rt, zfs_rs_get_end_raw(rs, rt));
                zfs_rs_set_fill(&newseg, rt, zfs_rs_get_end(rs, rt) - end);
                zfs_range_tree_stat_incr(rt, &newseg);

                /* This modifies the buffer already inside the range tree */
                zfs_rs_set_end(rs, rt, start);

                zfs_rs_copy(rs, &rs_tmp, rt);
                if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
                        zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
                else
                        zfs_btree_add(&rt->rt_root, &newseg);

                if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                        rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
        } else if (left_over) {
                /* This modifies the buffer already inside the range tree */
                zfs_rs_set_end(rs, rt, start);
                zfs_rs_copy(rs, &rs_tmp, rt);
        } else if (right_over) {
                /* This modifies the buffer already inside the range tree */
                zfs_rs_set_start(rs, rt, end);
                zfs_rs_copy(rs, &rs_tmp, rt);
        } else {
                zfs_btree_remove_idx(&rt->rt_root, &where);
                rs = NULL;
        }

        if (rs != NULL) {
                /*
                 * The fill of the leftover segment will always be equal to
                 * the size, since we do not support removing partial segments
                 * of range trees with gaps.
                 */
                zfs_rs_set_fill_raw(rs, rt, zfs_rs_get_end_raw(rs, rt) -
                    zfs_rs_get_start_raw(rs, rt));
                zfs_range_tree_stat_incr(rt, &rs_tmp);

                if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                        rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
        }

        rt->rt_space -= size;
}

void
zfs_range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
        zfs_range_tree_remove_impl(arg, start, size, B_FALSE);
}

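/*
 * On a gap tree, zfs_range_tree_remove_fill() interprets 'size' against the
 * matching segment's fill rather than its extent: if 'size' is less than
 * the fill, only the fill count is decremented; if it equals the fill, the
 * entire segment (including any bridged space) is removed.
 */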
void
zfs_range_tree_remove_fill(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
        zfs_range_tree_remove_impl(rt, start, size, B_TRUE);
}

void
zfs_range_tree_resize_segment(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    uint64_t newstart, uint64_t newsize)
{
        int64_t delta = newsize - (zfs_rs_get_end(rs, rt) -
            zfs_rs_get_start(rs, rt));

        zfs_range_tree_stat_decr(rt, rs);
        if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

        zfs_rs_set_start(rs, rt, newstart);
        zfs_rs_set_end(rs, rt, newstart + newsize);

        zfs_range_tree_stat_incr(rt, rs);
        if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

        rt->rt_space += delta;
}

static zfs_range_seg_t *
zfs_range_tree_find_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
        zfs_range_seg_max_t rsearch;
        uint64_t end = start + size;

        VERIFY(size != 0);

        zfs_rs_set_start(&rsearch, rt, start);
        zfs_rs_set_end(&rsearch, rt, end);
        return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
}

zfs_range_seg_t *
zfs_range_tree_find(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
        if (rt->rt_type == ZFS_RANGE_SEG64)
                ASSERT3U(start + size, >, start);

        zfs_range_seg_t *rs = zfs_range_tree_find_impl(rt, start, size);
        if (rs != NULL && zfs_rs_get_start(rs, rt) <= start &&
            zfs_rs_get_end(rs, rt) >= start + size) {
                return (rs);
        }
        return (NULL);
}

void
zfs_range_tree_verify_not_present(zfs_range_tree_t *rt, uint64_t off,
    uint64_t size)
{
        zfs_range_seg_t *rs = zfs_range_tree_find(rt, off, size);
        if (rs != NULL)
                panic("segment already in tree; rs=%p", (void *)rs);
}

boolean_t
zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
        return (zfs_range_tree_find(rt, start, size) != NULL);
}

/*
 * Finds the first subset of the given range which overlaps with the range
 * tree and stores it in *ostart / *osize. Returns true if there is such a
 * segment in the range, and false if there isn't.
 */
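/*
 * For example, with a single segment [0x1000, 0x2000) in the tree, a query
 * over [0x800, 0x1800) yields *ostart = 0x1000 and *osize = 0x800.
 */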
boolean_t
zfs_range_tree_find_in(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
    uint64_t *ostart, uint64_t *osize)
{
        if (rt->rt_type == ZFS_RANGE_SEG64)
                ASSERT3U(start + size, >, start);

        zfs_range_seg_max_t rsearch;
        zfs_rs_set_start(&rsearch, rt, start);
        zfs_rs_set_end_raw(&rsearch, rt, zfs_rs_get_start_raw(&rsearch, rt) +
            1);

        zfs_btree_index_t where;
        zfs_range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
        if (rs != NULL) {
                *ostart = start;
                *osize = MIN(size, zfs_rs_get_end(rs, rt) - start);
                return (B_TRUE);
        }

        rs = zfs_btree_next(&rt->rt_root, &where, &where);
        if (rs == NULL || zfs_rs_get_start(rs, rt) >= start + size)
                return (B_FALSE);

        *ostart = zfs_rs_get_start(rs, rt);
        *osize = MIN(start + size, zfs_rs_get_end(rs, rt)) -
            zfs_rs_get_start(rs, rt);
        return (B_TRUE);
}

/*
 * Ensure that this range is not in the tree: any overlapping portions that
 * are present are removed, and portions that are already absent are ignored
 * (unlike zfs_range_tree_remove(), which requires complete overlap).
 */
void
zfs_range_tree_clear(zfs_range_tree_t *rt, uint64_t start, uint64_t size)
{
        zfs_range_seg_t *rs;

        if (size == 0)
                return;

        if (rt->rt_type == ZFS_RANGE_SEG64)
                ASSERT3U(start + size, >, start);

        while ((rs = zfs_range_tree_find_impl(rt, start, size)) != NULL) {
                uint64_t free_start = MAX(zfs_rs_get_start(rs, rt), start);
                uint64_t free_end = MIN(zfs_rs_get_end(rs, rt), start + size);
                zfs_range_tree_remove(rt, free_start, free_end - free_start);
        }
}

void
zfs_range_tree_swap(zfs_range_tree_t **rtsrc, zfs_range_tree_t **rtdst)
{
        zfs_range_tree_t *rt;

        ASSERT0(zfs_range_tree_space(*rtdst));
        ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));

        rt = *rtsrc;
        *rtsrc = *rtdst;
        *rtdst = rt;
}

void
zfs_range_tree_vacate(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
        if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
                rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

        if (func != NULL) {
                zfs_range_seg_t *rs;
                zfs_btree_index_t *cookie = NULL;

                while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
                    NULL) {
                        func(arg, zfs_rs_get_start(rs, rt),
                            zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt));
                }
        } else {
                zfs_btree_clear(&rt->rt_root);
        }

        memset(rt->rt_histogram, 0, sizeof (rt->rt_histogram));
        rt->rt_space = 0;
}

void
zfs_range_tree_walk(zfs_range_tree_t *rt, zfs_range_tree_func_t *func,
    void *arg)
{
        zfs_btree_index_t where;
        for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
            rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
                func(arg, zfs_rs_get_start(rs, rt), zfs_rs_get_end(rs, rt) -
                    zfs_rs_get_start(rs, rt));
        }
}

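/*
 * Illustrative, compiled-out sketch (hypothetical helpers, not used
 * anywhere): a zfs_range_tree_walk() callback that totals the walked space.
 * The callback receives (arg, start, size) for each segment, matching the
 * calls made by zfs_range_tree_walk() and zfs_range_tree_vacate() above.
 */
#if 0
static void
zfs_range_tree_sum_cb(void *arg, uint64_t start, uint64_t size)
{
        (void) start;
        *(uint64_t *)arg += size;
}

static void
zfs_range_tree_walk_example(zfs_range_tree_t *rt)
{
        uint64_t total = 0;

        zfs_range_tree_walk(rt, zfs_range_tree_sum_cb, &total);
        /* Every byte tracked by the tree is visited exactly once. */
        ASSERT3U(total, ==, zfs_range_tree_space(rt));
}
#endif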
zfs_range_seg_t *
zfs_range_tree_first(zfs_range_tree_t *rt)
{
        return (zfs_btree_first(&rt->rt_root, NULL));
}

uint64_t
zfs_range_tree_space(zfs_range_tree_t *rt)
{
        return (rt->rt_space);
}

uint64_t
zfs_range_tree_numsegs(zfs_range_tree_t *rt)
{
        return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
}

boolean_t
zfs_range_tree_is_empty(zfs_range_tree_t *rt)
{
        ASSERT(rt != NULL);
        return (zfs_range_tree_space(rt) == 0);
}

/*
 * Remove from removefrom any portions of the segment [start, end) that
 * overlap it. Add the non-overlapping leftovers to addto.
 */
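/*
 * For example, if removefrom holds [0x1000, 0x2000) and the segment
 * [0x800, 0x1800) is processed, the overlap [0x1000, 0x1800) is removed
 * from removefrom (leaving [0x1800, 0x2000) there), and the leftover
 * [0x800, 0x1000) is added to addto.
 */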
void
zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
        zfs_btree_index_t where;
        zfs_range_seg_max_t starting_rs;
        zfs_rs_set_start(&starting_rs, removefrom, start);
        zfs_rs_set_end_raw(&starting_rs, removefrom,
            zfs_rs_get_start_raw(&starting_rs, removefrom) + 1);

        zfs_range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
            &starting_rs, &where);

        if (curr == NULL)
                curr = zfs_btree_next(&removefrom->rt_root, &where, &where);

        zfs_range_seg_t *next;
        for (; curr != NULL; curr = next) {
                if (start == end)
                        return;
                VERIFY3U(start, <, end);

                /* there is no overlap */
                if (end <= zfs_rs_get_start(curr, removefrom)) {
                        zfs_range_tree_add(addto, start, end - start);
                        return;
                }

                uint64_t overlap_start = MAX(zfs_rs_get_start(curr, removefrom),
                    start);
                uint64_t overlap_end = MIN(zfs_rs_get_end(curr, removefrom),
                    end);
                uint64_t overlap_size = overlap_end - overlap_start;
                ASSERT3S(overlap_size, >, 0);
                zfs_range_seg_max_t rs;
                zfs_rs_copy(curr, &rs, removefrom);

                zfs_range_tree_remove(removefrom, overlap_start, overlap_size);

                if (start < overlap_start)
                        zfs_range_tree_add(addto, start, overlap_start - start);

                start = overlap_end;
                next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
                /*
                 * If we find something here, we only removed part of the
                 * curr segment. Either there's some left at the end
                 * because we've reached the end of the range we're removing,
                 * or there's some left at the start because we started
                 * partway through the range. Either way, we continue with
                 * the loop. If it's the former, we'll return at the start of
                 * the loop, and if it's the latter we'll see if there is more
                 * area to process.
                 */
                if (next != NULL) {
                        ASSERT(start == end || start == zfs_rs_get_end(&rs,
                            removefrom));
                }

                next = zfs_btree_next(&removefrom->rt_root, &where, &where);
        }
        VERIFY0P(curr);

        if (start != end) {
                VERIFY3U(start, <, end);
                zfs_range_tree_add(addto, start, end - start);
        } else {
                VERIFY3U(start, ==, end);
        }
}

/*
 * For each entry in rt, remove the portions that are present in removefrom
 * from removefrom, and add the portions that are not to addto.
 */
void
zfs_range_tree_remove_xor_add(zfs_range_tree_t *rt,
    zfs_range_tree_t *removefrom, zfs_range_tree_t *addto)
{
        zfs_btree_index_t where;
        for (zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
            rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
                zfs_range_tree_remove_xor_add_segment(zfs_rs_get_start(rs, rt),
                    zfs_rs_get_end(rs, rt), removefrom, addto);
        }
}

uint64_t
zfs_range_tree_min(zfs_range_tree_t *rt)
{
        zfs_range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
        return (rs != NULL ? zfs_rs_get_start(rs, rt) : 0);
}

uint64_t
zfs_range_tree_max(zfs_range_tree_t *rt)
{
        zfs_range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
        return (rs != NULL ? zfs_rs_get_end(rs, rt) : 0);
}

uint64_t
zfs_range_tree_span(zfs_range_tree_t *rt)
{
        return (zfs_range_tree_max(rt) - zfs_range_tree_min(rt));
}