13241b1d3SJoe Thornber /* 23241b1d3SJoe Thornber * Copyright (C) 2011 Red Hat, Inc. 33241b1d3SJoe Thornber * 43241b1d3SJoe Thornber * This file is released under the GPL. 53241b1d3SJoe Thornber */ 63241b1d3SJoe Thornber 73241b1d3SJoe Thornber #include "dm-btree-internal.h" 83241b1d3SJoe Thornber #include "dm-space-map.h" 93241b1d3SJoe Thornber #include "dm-transaction-manager.h" 103241b1d3SJoe Thornber 111944ce60SPaul Gortmaker #include <linux/export.h> 123241b1d3SJoe Thornber #include <linux/device-mapper.h> 133241b1d3SJoe Thornber 143241b1d3SJoe Thornber #define DM_MSG_PREFIX "btree" 153241b1d3SJoe Thornber 163241b1d3SJoe Thornber /*---------------------------------------------------------------- 173241b1d3SJoe Thornber * Array manipulation 183241b1d3SJoe Thornber *--------------------------------------------------------------*/ 193241b1d3SJoe Thornber static void memcpy_disk(void *dest, const void *src, size_t len) 203241b1d3SJoe Thornber __dm_written_to_disk(src) 213241b1d3SJoe Thornber { 223241b1d3SJoe Thornber memcpy(dest, src, len); 233241b1d3SJoe Thornber __dm_unbless_for_disk(src); 243241b1d3SJoe Thornber } 253241b1d3SJoe Thornber 263241b1d3SJoe Thornber static void array_insert(void *base, size_t elt_size, unsigned nr_elts, 273241b1d3SJoe Thornber unsigned index, void *elt) 283241b1d3SJoe Thornber __dm_written_to_disk(elt) 293241b1d3SJoe Thornber { 303241b1d3SJoe Thornber if (index < nr_elts) 313241b1d3SJoe Thornber memmove(base + (elt_size * (index + 1)), 323241b1d3SJoe Thornber base + (elt_size * index), 333241b1d3SJoe Thornber (nr_elts - index) * elt_size); 343241b1d3SJoe Thornber 353241b1d3SJoe Thornber memcpy_disk(base + (elt_size * index), elt, elt_size); 363241b1d3SJoe Thornber } 373241b1d3SJoe Thornber 383241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 393241b1d3SJoe Thornber 403241b1d3SJoe Thornber /* makes the assumption that no two keys are the same. */ 41550929faSMikulas Patocka static int bsearch(struct btree_node *n, uint64_t key, int want_hi) 423241b1d3SJoe Thornber { 433241b1d3SJoe Thornber int lo = -1, hi = le32_to_cpu(n->header.nr_entries); 443241b1d3SJoe Thornber 453241b1d3SJoe Thornber while (hi - lo > 1) { 463241b1d3SJoe Thornber int mid = lo + ((hi - lo) / 2); 473241b1d3SJoe Thornber uint64_t mid_key = le64_to_cpu(n->keys[mid]); 483241b1d3SJoe Thornber 493241b1d3SJoe Thornber if (mid_key == key) 503241b1d3SJoe Thornber return mid; 513241b1d3SJoe Thornber 523241b1d3SJoe Thornber if (mid_key < key) 533241b1d3SJoe Thornber lo = mid; 543241b1d3SJoe Thornber else 553241b1d3SJoe Thornber hi = mid; 563241b1d3SJoe Thornber } 573241b1d3SJoe Thornber 583241b1d3SJoe Thornber return want_hi ? hi : lo; 593241b1d3SJoe Thornber } 603241b1d3SJoe Thornber 61550929faSMikulas Patocka int lower_bound(struct btree_node *n, uint64_t key) 623241b1d3SJoe Thornber { 633241b1d3SJoe Thornber return bsearch(n, key, 0); 643241b1d3SJoe Thornber } 653241b1d3SJoe Thornber 66993ceab9SJoe Thornber static int upper_bound(struct btree_node *n, uint64_t key) 67993ceab9SJoe Thornber { 68993ceab9SJoe Thornber return bsearch(n, key, 1); 69993ceab9SJoe Thornber } 70993ceab9SJoe Thornber 71550929faSMikulas Patocka void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, 723241b1d3SJoe Thornber struct dm_btree_value_type *vt) 733241b1d3SJoe Thornber { 743241b1d3SJoe Thornber unsigned i; 753241b1d3SJoe Thornber uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); 763241b1d3SJoe Thornber 773241b1d3SJoe Thornber if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) 783241b1d3SJoe Thornber for (i = 0; i < nr_entries; i++) 793241b1d3SJoe Thornber dm_tm_inc(tm, value64(n, i)); 803241b1d3SJoe Thornber else if (vt->inc) 813241b1d3SJoe Thornber for (i = 0; i < nr_entries; i++) 82a3aefb39SJoe Thornber vt->inc(vt->context, value_ptr(n, i)); 833241b1d3SJoe Thornber } 843241b1d3SJoe Thornber 85550929faSMikulas Patocka static int insert_at(size_t value_size, struct btree_node *node, unsigned index, 863241b1d3SJoe Thornber uint64_t key, void *value) 873241b1d3SJoe Thornber __dm_written_to_disk(value) 883241b1d3SJoe Thornber { 893241b1d3SJoe Thornber uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); 903241b1d3SJoe Thornber __le64 key_le = cpu_to_le64(key); 913241b1d3SJoe Thornber 923241b1d3SJoe Thornber if (index > nr_entries || 933241b1d3SJoe Thornber index >= le32_to_cpu(node->header.max_entries)) { 943241b1d3SJoe Thornber DMERR("too many entries in btree node for insert"); 953241b1d3SJoe Thornber __dm_unbless_for_disk(value); 963241b1d3SJoe Thornber return -ENOMEM; 973241b1d3SJoe Thornber } 983241b1d3SJoe Thornber 993241b1d3SJoe Thornber __dm_bless_for_disk(&key_le); 1003241b1d3SJoe Thornber 1013241b1d3SJoe Thornber array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le); 1023241b1d3SJoe Thornber array_insert(value_base(node), value_size, nr_entries, index, value); 1033241b1d3SJoe Thornber node->header.nr_entries = cpu_to_le32(nr_entries + 1); 1043241b1d3SJoe Thornber 1053241b1d3SJoe Thornber return 0; 1063241b1d3SJoe Thornber } 1073241b1d3SJoe Thornber 1083241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 1093241b1d3SJoe Thornber 1103241b1d3SJoe Thornber /* 1113241b1d3SJoe Thornber * We want 3n entries (for some n). This works more nicely for repeated 1123241b1d3SJoe Thornber * insert remove loops than (2n + 1). 1133241b1d3SJoe Thornber */ 1143241b1d3SJoe Thornber static uint32_t calc_max_entries(size_t value_size, size_t block_size) 1153241b1d3SJoe Thornber { 1163241b1d3SJoe Thornber uint32_t total, n; 1173241b1d3SJoe Thornber size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */ 1183241b1d3SJoe Thornber 1193241b1d3SJoe Thornber block_size -= sizeof(struct node_header); 1203241b1d3SJoe Thornber total = block_size / elt_size; 1213241b1d3SJoe Thornber n = total / 3; /* rounds down */ 1223241b1d3SJoe Thornber 1233241b1d3SJoe Thornber return 3 * n; 1243241b1d3SJoe Thornber } 1253241b1d3SJoe Thornber 1263241b1d3SJoe Thornber int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) 1273241b1d3SJoe Thornber { 1283241b1d3SJoe Thornber int r; 1293241b1d3SJoe Thornber struct dm_block *b; 130550929faSMikulas Patocka struct btree_node *n; 1313241b1d3SJoe Thornber size_t block_size; 1323241b1d3SJoe Thornber uint32_t max_entries; 1333241b1d3SJoe Thornber 1343241b1d3SJoe Thornber r = new_block(info, &b); 1353241b1d3SJoe Thornber if (r < 0) 1363241b1d3SJoe Thornber return r; 1373241b1d3SJoe Thornber 1383241b1d3SJoe Thornber block_size = dm_bm_block_size(dm_tm_get_bm(info->tm)); 1393241b1d3SJoe Thornber max_entries = calc_max_entries(info->value_type.size, block_size); 1403241b1d3SJoe Thornber 1413241b1d3SJoe Thornber n = dm_block_data(b); 1423241b1d3SJoe Thornber memset(n, 0, block_size); 1433241b1d3SJoe Thornber n->header.flags = cpu_to_le32(LEAF_NODE); 1443241b1d3SJoe Thornber n->header.nr_entries = cpu_to_le32(0); 1453241b1d3SJoe Thornber n->header.max_entries = cpu_to_le32(max_entries); 1463241b1d3SJoe Thornber n->header.value_size = cpu_to_le32(info->value_type.size); 1473241b1d3SJoe Thornber 1483241b1d3SJoe Thornber *root = dm_block_location(b); 1494c7da06fSMikulas Patocka unlock_block(info, b); 1504c7da06fSMikulas Patocka 1514c7da06fSMikulas Patocka return 0; 1523241b1d3SJoe Thornber } 1533241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_empty); 1543241b1d3SJoe Thornber 1553241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 1563241b1d3SJoe Thornber 1573241b1d3SJoe Thornber /* 1583241b1d3SJoe Thornber * Deletion uses a recursive algorithm, since we have limited stack space 1593241b1d3SJoe Thornber * we explicitly manage our own stack on the heap. 1603241b1d3SJoe Thornber */ 1613241b1d3SJoe Thornber #define MAX_SPINE_DEPTH 64 1623241b1d3SJoe Thornber struct frame { 1633241b1d3SJoe Thornber struct dm_block *b; 164550929faSMikulas Patocka struct btree_node *n; 1653241b1d3SJoe Thornber unsigned level; 1663241b1d3SJoe Thornber unsigned nr_children; 1673241b1d3SJoe Thornber unsigned current_child; 1683241b1d3SJoe Thornber }; 1693241b1d3SJoe Thornber 1703241b1d3SJoe Thornber struct del_stack { 17104f17c80SJoe Thornber struct dm_btree_info *info; 1723241b1d3SJoe Thornber struct dm_transaction_manager *tm; 1733241b1d3SJoe Thornber int top; 1743241b1d3SJoe Thornber struct frame spine[MAX_SPINE_DEPTH]; 1753241b1d3SJoe Thornber }; 1763241b1d3SJoe Thornber 1773241b1d3SJoe Thornber static int top_frame(struct del_stack *s, struct frame **f) 1783241b1d3SJoe Thornber { 1793241b1d3SJoe Thornber if (s->top < 0) { 1803241b1d3SJoe Thornber DMERR("btree deletion stack empty"); 1813241b1d3SJoe Thornber return -EINVAL; 1823241b1d3SJoe Thornber } 1833241b1d3SJoe Thornber 1843241b1d3SJoe Thornber *f = s->spine + s->top; 1853241b1d3SJoe Thornber 1863241b1d3SJoe Thornber return 0; 1873241b1d3SJoe Thornber } 1883241b1d3SJoe Thornber 1893241b1d3SJoe Thornber static int unprocessed_frames(struct del_stack *s) 1903241b1d3SJoe Thornber { 1913241b1d3SJoe Thornber return s->top >= 0; 1923241b1d3SJoe Thornber } 1933241b1d3SJoe Thornber 19404f17c80SJoe Thornber static void prefetch_children(struct del_stack *s, struct frame *f) 19504f17c80SJoe Thornber { 19604f17c80SJoe Thornber unsigned i; 19704f17c80SJoe Thornber struct dm_block_manager *bm = dm_tm_get_bm(s->tm); 19804f17c80SJoe Thornber 19904f17c80SJoe Thornber for (i = 0; i < f->nr_children; i++) 20004f17c80SJoe Thornber dm_bm_prefetch(bm, value64(f->n, i)); 20104f17c80SJoe Thornber } 20204f17c80SJoe Thornber 20304f17c80SJoe Thornber static bool is_internal_level(struct dm_btree_info *info, struct frame *f) 20404f17c80SJoe Thornber { 20504f17c80SJoe Thornber return f->level < (info->levels - 1); 20604f17c80SJoe Thornber } 20704f17c80SJoe Thornber 2083241b1d3SJoe Thornber static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) 2093241b1d3SJoe Thornber { 2103241b1d3SJoe Thornber int r; 2113241b1d3SJoe Thornber uint32_t ref_count; 2123241b1d3SJoe Thornber 2133241b1d3SJoe Thornber if (s->top >= MAX_SPINE_DEPTH - 1) { 2143241b1d3SJoe Thornber DMERR("btree deletion stack out of memory"); 2153241b1d3SJoe Thornber return -ENOMEM; 2163241b1d3SJoe Thornber } 2173241b1d3SJoe Thornber 2183241b1d3SJoe Thornber r = dm_tm_ref(s->tm, b, &ref_count); 2193241b1d3SJoe Thornber if (r) 2203241b1d3SJoe Thornber return r; 2213241b1d3SJoe Thornber 2223241b1d3SJoe Thornber if (ref_count > 1) 2233241b1d3SJoe Thornber /* 2243241b1d3SJoe Thornber * This is a shared node, so we can just decrement it's 2253241b1d3SJoe Thornber * reference counter and leave the children. 2263241b1d3SJoe Thornber */ 2273241b1d3SJoe Thornber dm_tm_dec(s->tm, b); 2283241b1d3SJoe Thornber 2293241b1d3SJoe Thornber else { 23004f17c80SJoe Thornber uint32_t flags; 2313241b1d3SJoe Thornber struct frame *f = s->spine + ++s->top; 2323241b1d3SJoe Thornber 2333241b1d3SJoe Thornber r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); 2343241b1d3SJoe Thornber if (r) { 2353241b1d3SJoe Thornber s->top--; 2363241b1d3SJoe Thornber return r; 2373241b1d3SJoe Thornber } 2383241b1d3SJoe Thornber 2393241b1d3SJoe Thornber f->n = dm_block_data(f->b); 2403241b1d3SJoe Thornber f->level = level; 2413241b1d3SJoe Thornber f->nr_children = le32_to_cpu(f->n->header.nr_entries); 2423241b1d3SJoe Thornber f->current_child = 0; 24304f17c80SJoe Thornber 24404f17c80SJoe Thornber flags = le32_to_cpu(f->n->header.flags); 24504f17c80SJoe Thornber if (flags & INTERNAL_NODE || is_internal_level(s->info, f)) 24604f17c80SJoe Thornber prefetch_children(s, f); 2473241b1d3SJoe Thornber } 2483241b1d3SJoe Thornber 2493241b1d3SJoe Thornber return 0; 2503241b1d3SJoe Thornber } 2513241b1d3SJoe Thornber 2523241b1d3SJoe Thornber static void pop_frame(struct del_stack *s) 2533241b1d3SJoe Thornber { 2543241b1d3SJoe Thornber struct frame *f = s->spine + s->top--; 2553241b1d3SJoe Thornber 2563241b1d3SJoe Thornber dm_tm_dec(s->tm, dm_block_location(f->b)); 2573241b1d3SJoe Thornber dm_tm_unlock(s->tm, f->b); 2583241b1d3SJoe Thornber } 2593241b1d3SJoe Thornber 260ed8b45a3SJoe Thornber static void unlock_all_frames(struct del_stack *s) 261ed8b45a3SJoe Thornber { 262ed8b45a3SJoe Thornber struct frame *f; 263ed8b45a3SJoe Thornber 264ed8b45a3SJoe Thornber while (unprocessed_frames(s)) { 265ed8b45a3SJoe Thornber f = s->spine + s->top--; 266ed8b45a3SJoe Thornber dm_tm_unlock(s->tm, f->b); 267ed8b45a3SJoe Thornber } 268ed8b45a3SJoe Thornber } 269ed8b45a3SJoe Thornber 2703241b1d3SJoe Thornber int dm_btree_del(struct dm_btree_info *info, dm_block_t root) 2713241b1d3SJoe Thornber { 2723241b1d3SJoe Thornber int r; 2733241b1d3SJoe Thornber struct del_stack *s; 2743241b1d3SJoe Thornber 2759f9ef065SJoe Thornber /* 2769f9ef065SJoe Thornber * dm_btree_del() is called via an ioctl, as such should be 2779f9ef065SJoe Thornber * considered an FS op. We can't recurse back into the FS, so we 2789f9ef065SJoe Thornber * allocate GFP_NOFS. 2799f9ef065SJoe Thornber */ 2809f9ef065SJoe Thornber s = kmalloc(sizeof(*s), GFP_NOFS); 2813241b1d3SJoe Thornber if (!s) 2823241b1d3SJoe Thornber return -ENOMEM; 28304f17c80SJoe Thornber s->info = info; 2843241b1d3SJoe Thornber s->tm = info->tm; 2853241b1d3SJoe Thornber s->top = -1; 2863241b1d3SJoe Thornber 287e3cbf945SJoe Thornber r = push_frame(s, root, 0); 2883241b1d3SJoe Thornber if (r) 2893241b1d3SJoe Thornber goto out; 2903241b1d3SJoe Thornber 2913241b1d3SJoe Thornber while (unprocessed_frames(s)) { 2923241b1d3SJoe Thornber uint32_t flags; 2933241b1d3SJoe Thornber struct frame *f; 2943241b1d3SJoe Thornber dm_block_t b; 2953241b1d3SJoe Thornber 2963241b1d3SJoe Thornber r = top_frame(s, &f); 2973241b1d3SJoe Thornber if (r) 2983241b1d3SJoe Thornber goto out; 2993241b1d3SJoe Thornber 3003241b1d3SJoe Thornber if (f->current_child >= f->nr_children) { 3013241b1d3SJoe Thornber pop_frame(s); 3023241b1d3SJoe Thornber continue; 3033241b1d3SJoe Thornber } 3043241b1d3SJoe Thornber 3053241b1d3SJoe Thornber flags = le32_to_cpu(f->n->header.flags); 3063241b1d3SJoe Thornber if (flags & INTERNAL_NODE) { 3073241b1d3SJoe Thornber b = value64(f->n, f->current_child); 3083241b1d3SJoe Thornber f->current_child++; 3093241b1d3SJoe Thornber r = push_frame(s, b, f->level); 3103241b1d3SJoe Thornber if (r) 3113241b1d3SJoe Thornber goto out; 3123241b1d3SJoe Thornber 313e3cbf945SJoe Thornber } else if (is_internal_level(info, f)) { 3143241b1d3SJoe Thornber b = value64(f->n, f->current_child); 3153241b1d3SJoe Thornber f->current_child++; 3163241b1d3SJoe Thornber r = push_frame(s, b, f->level + 1); 3173241b1d3SJoe Thornber if (r) 3183241b1d3SJoe Thornber goto out; 3193241b1d3SJoe Thornber 3203241b1d3SJoe Thornber } else { 3213241b1d3SJoe Thornber if (info->value_type.dec) { 3223241b1d3SJoe Thornber unsigned i; 3233241b1d3SJoe Thornber 3243241b1d3SJoe Thornber for (i = 0; i < f->nr_children; i++) 3253241b1d3SJoe Thornber info->value_type.dec(info->value_type.context, 326a3aefb39SJoe Thornber value_ptr(f->n, i)); 3273241b1d3SJoe Thornber } 328cd5acf0bSJoe Thornber pop_frame(s); 3293241b1d3SJoe Thornber } 3303241b1d3SJoe Thornber } 3313241b1d3SJoe Thornber out: 332ed8b45a3SJoe Thornber if (r) { 333ed8b45a3SJoe Thornber /* cleanup all frames of del_stack */ 334ed8b45a3SJoe Thornber unlock_all_frames(s); 335ed8b45a3SJoe Thornber } 3363241b1d3SJoe Thornber kfree(s); 337ed8b45a3SJoe Thornber 3383241b1d3SJoe Thornber return r; 3393241b1d3SJoe Thornber } 3403241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_del); 3413241b1d3SJoe Thornber 3423241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 3433241b1d3SJoe Thornber 3443241b1d3SJoe Thornber static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, 345550929faSMikulas Patocka int (*search_fn)(struct btree_node *, uint64_t), 3463241b1d3SJoe Thornber uint64_t *result_key, void *v, size_t value_size) 3473241b1d3SJoe Thornber { 3483241b1d3SJoe Thornber int i, r; 3493241b1d3SJoe Thornber uint32_t flags, nr_entries; 3503241b1d3SJoe Thornber 3513241b1d3SJoe Thornber do { 3523241b1d3SJoe Thornber r = ro_step(s, block); 3533241b1d3SJoe Thornber if (r < 0) 3543241b1d3SJoe Thornber return r; 3553241b1d3SJoe Thornber 3563241b1d3SJoe Thornber i = search_fn(ro_node(s), key); 3573241b1d3SJoe Thornber 3583241b1d3SJoe Thornber flags = le32_to_cpu(ro_node(s)->header.flags); 3593241b1d3SJoe Thornber nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries); 3603241b1d3SJoe Thornber if (i < 0 || i >= nr_entries) 3613241b1d3SJoe Thornber return -ENODATA; 3623241b1d3SJoe Thornber 3633241b1d3SJoe Thornber if (flags & INTERNAL_NODE) 3643241b1d3SJoe Thornber block = value64(ro_node(s), i); 3653241b1d3SJoe Thornber 3663241b1d3SJoe Thornber } while (!(flags & LEAF_NODE)); 3673241b1d3SJoe Thornber 3683241b1d3SJoe Thornber *result_key = le64_to_cpu(ro_node(s)->keys[i]); 369a3aefb39SJoe Thornber memcpy(v, value_ptr(ro_node(s), i), value_size); 3703241b1d3SJoe Thornber 3713241b1d3SJoe Thornber return 0; 3723241b1d3SJoe Thornber } 3733241b1d3SJoe Thornber 3743241b1d3SJoe Thornber int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, 3753241b1d3SJoe Thornber uint64_t *keys, void *value_le) 3763241b1d3SJoe Thornber { 3773241b1d3SJoe Thornber unsigned level, last_level = info->levels - 1; 3783241b1d3SJoe Thornber int r = -ENODATA; 3793241b1d3SJoe Thornber uint64_t rkey; 3803241b1d3SJoe Thornber __le64 internal_value_le; 3813241b1d3SJoe Thornber struct ro_spine spine; 3823241b1d3SJoe Thornber 3833241b1d3SJoe Thornber init_ro_spine(&spine, info); 3843241b1d3SJoe Thornber for (level = 0; level < info->levels; level++) { 3853241b1d3SJoe Thornber size_t size; 3863241b1d3SJoe Thornber void *value_p; 3873241b1d3SJoe Thornber 3883241b1d3SJoe Thornber if (level == last_level) { 3893241b1d3SJoe Thornber value_p = value_le; 3903241b1d3SJoe Thornber size = info->value_type.size; 3913241b1d3SJoe Thornber 3923241b1d3SJoe Thornber } else { 3933241b1d3SJoe Thornber value_p = &internal_value_le; 3943241b1d3SJoe Thornber size = sizeof(uint64_t); 3953241b1d3SJoe Thornber } 3963241b1d3SJoe Thornber 3973241b1d3SJoe Thornber r = btree_lookup_raw(&spine, root, keys[level], 3983241b1d3SJoe Thornber lower_bound, &rkey, 3993241b1d3SJoe Thornber value_p, size); 4003241b1d3SJoe Thornber 4013241b1d3SJoe Thornber if (!r) { 4023241b1d3SJoe Thornber if (rkey != keys[level]) { 4033241b1d3SJoe Thornber exit_ro_spine(&spine); 4043241b1d3SJoe Thornber return -ENODATA; 4053241b1d3SJoe Thornber } 4063241b1d3SJoe Thornber } else { 4073241b1d3SJoe Thornber exit_ro_spine(&spine); 4083241b1d3SJoe Thornber return r; 4093241b1d3SJoe Thornber } 4103241b1d3SJoe Thornber 4113241b1d3SJoe Thornber root = le64_to_cpu(internal_value_le); 4123241b1d3SJoe Thornber } 4133241b1d3SJoe Thornber exit_ro_spine(&spine); 4143241b1d3SJoe Thornber 4153241b1d3SJoe Thornber return r; 4163241b1d3SJoe Thornber } 4173241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_lookup); 4183241b1d3SJoe Thornber 419993ceab9SJoe Thornber static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root, 420993ceab9SJoe Thornber uint64_t key, uint64_t *rkey, void *value_le) 421993ceab9SJoe Thornber { 422993ceab9SJoe Thornber int r, i; 423993ceab9SJoe Thornber uint32_t flags, nr_entries; 424993ceab9SJoe Thornber struct dm_block *node; 425993ceab9SJoe Thornber struct btree_node *n; 426993ceab9SJoe Thornber 427993ceab9SJoe Thornber r = bn_read_lock(info, root, &node); 428993ceab9SJoe Thornber if (r) 429993ceab9SJoe Thornber return r; 430993ceab9SJoe Thornber 431993ceab9SJoe Thornber n = dm_block_data(node); 432993ceab9SJoe Thornber flags = le32_to_cpu(n->header.flags); 433993ceab9SJoe Thornber nr_entries = le32_to_cpu(n->header.nr_entries); 434993ceab9SJoe Thornber 435993ceab9SJoe Thornber if (flags & INTERNAL_NODE) { 436993ceab9SJoe Thornber i = lower_bound(n, key); 437e7e0f730SJoe Thornber if (i < 0) { 438e7e0f730SJoe Thornber /* 439e7e0f730SJoe Thornber * avoid early -ENODATA return when all entries are 440e7e0f730SJoe Thornber * higher than the search @key. 441e7e0f730SJoe Thornber */ 442e7e0f730SJoe Thornber i = 0; 443e7e0f730SJoe Thornber } 444e7e0f730SJoe Thornber if (i >= nr_entries) { 445993ceab9SJoe Thornber r = -ENODATA; 446993ceab9SJoe Thornber goto out; 447993ceab9SJoe Thornber } 448993ceab9SJoe Thornber 449993ceab9SJoe Thornber r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); 450993ceab9SJoe Thornber if (r == -ENODATA && i < (nr_entries - 1)) { 451993ceab9SJoe Thornber i++; 452993ceab9SJoe Thornber r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le); 453993ceab9SJoe Thornber } 454993ceab9SJoe Thornber 455993ceab9SJoe Thornber } else { 456993ceab9SJoe Thornber i = upper_bound(n, key); 457993ceab9SJoe Thornber if (i < 0 || i >= nr_entries) { 458993ceab9SJoe Thornber r = -ENODATA; 459993ceab9SJoe Thornber goto out; 460993ceab9SJoe Thornber } 461993ceab9SJoe Thornber 462993ceab9SJoe Thornber *rkey = le64_to_cpu(n->keys[i]); 463993ceab9SJoe Thornber memcpy(value_le, value_ptr(n, i), info->value_type.size); 464993ceab9SJoe Thornber } 465993ceab9SJoe Thornber out: 466993ceab9SJoe Thornber dm_tm_unlock(info->tm, node); 467993ceab9SJoe Thornber return r; 468993ceab9SJoe Thornber } 469993ceab9SJoe Thornber 470993ceab9SJoe Thornber int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, 471993ceab9SJoe Thornber uint64_t *keys, uint64_t *rkey, void *value_le) 472993ceab9SJoe Thornber { 473993ceab9SJoe Thornber unsigned level; 474993ceab9SJoe Thornber int r = -ENODATA; 475993ceab9SJoe Thornber __le64 internal_value_le; 476993ceab9SJoe Thornber struct ro_spine spine; 477993ceab9SJoe Thornber 478993ceab9SJoe Thornber init_ro_spine(&spine, info); 479993ceab9SJoe Thornber for (level = 0; level < info->levels - 1u; level++) { 480993ceab9SJoe Thornber r = btree_lookup_raw(&spine, root, keys[level], 481993ceab9SJoe Thornber lower_bound, rkey, 482993ceab9SJoe Thornber &internal_value_le, sizeof(uint64_t)); 483993ceab9SJoe Thornber if (r) 484993ceab9SJoe Thornber goto out; 485993ceab9SJoe Thornber 486993ceab9SJoe Thornber if (*rkey != keys[level]) { 487993ceab9SJoe Thornber r = -ENODATA; 488993ceab9SJoe Thornber goto out; 489993ceab9SJoe Thornber } 490993ceab9SJoe Thornber 491993ceab9SJoe Thornber root = le64_to_cpu(internal_value_le); 492993ceab9SJoe Thornber } 493993ceab9SJoe Thornber 494993ceab9SJoe Thornber r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le); 495993ceab9SJoe Thornber out: 496993ceab9SJoe Thornber exit_ro_spine(&spine); 497993ceab9SJoe Thornber return r; 498993ceab9SJoe Thornber } 499993ceab9SJoe Thornber 500993ceab9SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_lookup_next); 501993ceab9SJoe Thornber 5023241b1d3SJoe Thornber /* 5033241b1d3SJoe Thornber * Splits a node by creating a sibling node and shifting half the nodes 5043241b1d3SJoe Thornber * contents across. Assumes there is a parent node, and it has room for 5053241b1d3SJoe Thornber * another child. 5063241b1d3SJoe Thornber * 5073241b1d3SJoe Thornber * Before: 5083241b1d3SJoe Thornber * +--------+ 5093241b1d3SJoe Thornber * | Parent | 5103241b1d3SJoe Thornber * +--------+ 5113241b1d3SJoe Thornber * | 5123241b1d3SJoe Thornber * v 5133241b1d3SJoe Thornber * +----------+ 5143241b1d3SJoe Thornber * | A ++++++ | 5153241b1d3SJoe Thornber * +----------+ 5163241b1d3SJoe Thornber * 5173241b1d3SJoe Thornber * 5183241b1d3SJoe Thornber * After: 5193241b1d3SJoe Thornber * +--------+ 5203241b1d3SJoe Thornber * | Parent | 5213241b1d3SJoe Thornber * +--------+ 5223241b1d3SJoe Thornber * | | 5233241b1d3SJoe Thornber * v +------+ 5243241b1d3SJoe Thornber * +---------+ | 5253241b1d3SJoe Thornber * | A* +++ | v 5263241b1d3SJoe Thornber * +---------+ +-------+ 5273241b1d3SJoe Thornber * | B +++ | 5283241b1d3SJoe Thornber * +-------+ 5293241b1d3SJoe Thornber * 5303241b1d3SJoe Thornber * Where A* is a shadow of A. 5313241b1d3SJoe Thornber */ 5320a8d4c3eSVivek Goyal static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, 5330a8d4c3eSVivek Goyal uint64_t key) 5343241b1d3SJoe Thornber { 5353241b1d3SJoe Thornber int r; 5363241b1d3SJoe Thornber size_t size; 5373241b1d3SJoe Thornber unsigned nr_left, nr_right; 5383241b1d3SJoe Thornber struct dm_block *left, *right, *parent; 539550929faSMikulas Patocka struct btree_node *ln, *rn, *pn; 5403241b1d3SJoe Thornber __le64 location; 5413241b1d3SJoe Thornber 5423241b1d3SJoe Thornber left = shadow_current(s); 5433241b1d3SJoe Thornber 5443241b1d3SJoe Thornber r = new_block(s->info, &right); 5453241b1d3SJoe Thornber if (r < 0) 5463241b1d3SJoe Thornber return r; 5473241b1d3SJoe Thornber 5483241b1d3SJoe Thornber ln = dm_block_data(left); 5493241b1d3SJoe Thornber rn = dm_block_data(right); 5503241b1d3SJoe Thornber 5513241b1d3SJoe Thornber nr_left = le32_to_cpu(ln->header.nr_entries) / 2; 5523241b1d3SJoe Thornber nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; 5533241b1d3SJoe Thornber 5543241b1d3SJoe Thornber ln->header.nr_entries = cpu_to_le32(nr_left); 5553241b1d3SJoe Thornber 5563241b1d3SJoe Thornber rn->header.flags = ln->header.flags; 5573241b1d3SJoe Thornber rn->header.nr_entries = cpu_to_le32(nr_right); 5583241b1d3SJoe Thornber rn->header.max_entries = ln->header.max_entries; 5593241b1d3SJoe Thornber rn->header.value_size = ln->header.value_size; 5603241b1d3SJoe Thornber memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); 5613241b1d3SJoe Thornber 5623241b1d3SJoe Thornber size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? 5633241b1d3SJoe Thornber sizeof(uint64_t) : s->info->value_type.size; 564a3aefb39SJoe Thornber memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), 5653241b1d3SJoe Thornber size * nr_right); 5663241b1d3SJoe Thornber 5673241b1d3SJoe Thornber /* 5683241b1d3SJoe Thornber * Patch up the parent 5693241b1d3SJoe Thornber */ 5703241b1d3SJoe Thornber parent = shadow_parent(s); 5713241b1d3SJoe Thornber 5723241b1d3SJoe Thornber pn = dm_block_data(parent); 5733241b1d3SJoe Thornber location = cpu_to_le64(dm_block_location(left)); 5743241b1d3SJoe Thornber __dm_bless_for_disk(&location); 575a3aefb39SJoe Thornber memcpy_disk(value_ptr(pn, parent_index), 5763241b1d3SJoe Thornber &location, sizeof(__le64)); 5773241b1d3SJoe Thornber 5783241b1d3SJoe Thornber location = cpu_to_le64(dm_block_location(right)); 5793241b1d3SJoe Thornber __dm_bless_for_disk(&location); 5803241b1d3SJoe Thornber 5813241b1d3SJoe Thornber r = insert_at(sizeof(__le64), pn, parent_index + 1, 5823241b1d3SJoe Thornber le64_to_cpu(rn->keys[0]), &location); 58330ce6e1cSMike Snitzer if (r) { 58430ce6e1cSMike Snitzer unlock_block(s->info, right); 5853241b1d3SJoe Thornber return r; 58630ce6e1cSMike Snitzer } 5873241b1d3SJoe Thornber 5883241b1d3SJoe Thornber if (key < le64_to_cpu(rn->keys[0])) { 5893241b1d3SJoe Thornber unlock_block(s->info, right); 5903241b1d3SJoe Thornber s->nodes[1] = left; 5913241b1d3SJoe Thornber } else { 5923241b1d3SJoe Thornber unlock_block(s->info, left); 5933241b1d3SJoe Thornber s->nodes[1] = right; 5943241b1d3SJoe Thornber } 5953241b1d3SJoe Thornber 5963241b1d3SJoe Thornber return 0; 5973241b1d3SJoe Thornber } 5983241b1d3SJoe Thornber 5993241b1d3SJoe Thornber /* 6003241b1d3SJoe Thornber * Splits a node by creating two new children beneath the given node. 6013241b1d3SJoe Thornber * 6023241b1d3SJoe Thornber * Before: 6033241b1d3SJoe Thornber * +----------+ 6043241b1d3SJoe Thornber * | A ++++++ | 6053241b1d3SJoe Thornber * +----------+ 6063241b1d3SJoe Thornber * 6073241b1d3SJoe Thornber * 6083241b1d3SJoe Thornber * After: 6093241b1d3SJoe Thornber * +------------+ 6103241b1d3SJoe Thornber * | A (shadow) | 6113241b1d3SJoe Thornber * +------------+ 6123241b1d3SJoe Thornber * | | 6133241b1d3SJoe Thornber * +------+ +----+ 6143241b1d3SJoe Thornber * | | 6153241b1d3SJoe Thornber * v v 6163241b1d3SJoe Thornber * +-------+ +-------+ 6173241b1d3SJoe Thornber * | B +++ | | C +++ | 6183241b1d3SJoe Thornber * +-------+ +-------+ 6193241b1d3SJoe Thornber */ 6203241b1d3SJoe Thornber static int btree_split_beneath(struct shadow_spine *s, uint64_t key) 6213241b1d3SJoe Thornber { 6223241b1d3SJoe Thornber int r; 6233241b1d3SJoe Thornber size_t size; 6243241b1d3SJoe Thornber unsigned nr_left, nr_right; 6253241b1d3SJoe Thornber struct dm_block *left, *right, *new_parent; 626550929faSMikulas Patocka struct btree_node *pn, *ln, *rn; 6273241b1d3SJoe Thornber __le64 val; 6283241b1d3SJoe Thornber 6293241b1d3SJoe Thornber new_parent = shadow_current(s); 6303241b1d3SJoe Thornber 6313241b1d3SJoe Thornber r = new_block(s->info, &left); 6323241b1d3SJoe Thornber if (r < 0) 6333241b1d3SJoe Thornber return r; 6343241b1d3SJoe Thornber 6353241b1d3SJoe Thornber r = new_block(s->info, &right); 6363241b1d3SJoe Thornber if (r < 0) { 6374dcb8b57SMike Snitzer unlock_block(s->info, left); 6383241b1d3SJoe Thornber return r; 6393241b1d3SJoe Thornber } 6403241b1d3SJoe Thornber 6413241b1d3SJoe Thornber pn = dm_block_data(new_parent); 6423241b1d3SJoe Thornber ln = dm_block_data(left); 6433241b1d3SJoe Thornber rn = dm_block_data(right); 6443241b1d3SJoe Thornber 6453241b1d3SJoe Thornber nr_left = le32_to_cpu(pn->header.nr_entries) / 2; 6463241b1d3SJoe Thornber nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; 6473241b1d3SJoe Thornber 6483241b1d3SJoe Thornber ln->header.flags = pn->header.flags; 6493241b1d3SJoe Thornber ln->header.nr_entries = cpu_to_le32(nr_left); 6503241b1d3SJoe Thornber ln->header.max_entries = pn->header.max_entries; 6513241b1d3SJoe Thornber ln->header.value_size = pn->header.value_size; 6523241b1d3SJoe Thornber 6533241b1d3SJoe Thornber rn->header.flags = pn->header.flags; 6543241b1d3SJoe Thornber rn->header.nr_entries = cpu_to_le32(nr_right); 6553241b1d3SJoe Thornber rn->header.max_entries = pn->header.max_entries; 6563241b1d3SJoe Thornber rn->header.value_size = pn->header.value_size; 6573241b1d3SJoe Thornber 6583241b1d3SJoe Thornber memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); 6593241b1d3SJoe Thornber memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); 6603241b1d3SJoe Thornber 6613241b1d3SJoe Thornber size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 6623241b1d3SJoe Thornber sizeof(__le64) : s->info->value_type.size; 663a3aefb39SJoe Thornber memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); 664a3aefb39SJoe Thornber memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), 6653241b1d3SJoe Thornber nr_right * size); 6663241b1d3SJoe Thornber 6673241b1d3SJoe Thornber /* new_parent should just point to l and r now */ 6683241b1d3SJoe Thornber pn->header.flags = cpu_to_le32(INTERNAL_NODE); 6693241b1d3SJoe Thornber pn->header.nr_entries = cpu_to_le32(2); 6703241b1d3SJoe Thornber pn->header.max_entries = cpu_to_le32( 6713241b1d3SJoe Thornber calc_max_entries(sizeof(__le64), 6723241b1d3SJoe Thornber dm_bm_block_size( 6733241b1d3SJoe Thornber dm_tm_get_bm(s->info->tm)))); 6743241b1d3SJoe Thornber pn->header.value_size = cpu_to_le32(sizeof(__le64)); 6753241b1d3SJoe Thornber 6763241b1d3SJoe Thornber val = cpu_to_le64(dm_block_location(left)); 6773241b1d3SJoe Thornber __dm_bless_for_disk(&val); 6783241b1d3SJoe Thornber pn->keys[0] = ln->keys[0]; 679a3aefb39SJoe Thornber memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); 6803241b1d3SJoe Thornber 6813241b1d3SJoe Thornber val = cpu_to_le64(dm_block_location(right)); 6823241b1d3SJoe Thornber __dm_bless_for_disk(&val); 6833241b1d3SJoe Thornber pn->keys[1] = rn->keys[0]; 684a3aefb39SJoe Thornber memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); 6853241b1d3SJoe Thornber 6863241b1d3SJoe Thornber /* 6873241b1d3SJoe Thornber * rejig the spine. This is ugly, since it knows too 6883241b1d3SJoe Thornber * much about the spine 6893241b1d3SJoe Thornber */ 6903241b1d3SJoe Thornber if (s->nodes[0] != new_parent) { 6913241b1d3SJoe Thornber unlock_block(s->info, s->nodes[0]); 6923241b1d3SJoe Thornber s->nodes[0] = new_parent; 6933241b1d3SJoe Thornber } 6943241b1d3SJoe Thornber if (key < le64_to_cpu(rn->keys[0])) { 6953241b1d3SJoe Thornber unlock_block(s->info, right); 6963241b1d3SJoe Thornber s->nodes[1] = left; 6973241b1d3SJoe Thornber } else { 6983241b1d3SJoe Thornber unlock_block(s->info, left); 6993241b1d3SJoe Thornber s->nodes[1] = right; 7003241b1d3SJoe Thornber } 7013241b1d3SJoe Thornber s->count = 2; 7023241b1d3SJoe Thornber 7033241b1d3SJoe Thornber return 0; 7043241b1d3SJoe Thornber } 7053241b1d3SJoe Thornber 7063241b1d3SJoe Thornber static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, 7073241b1d3SJoe Thornber struct dm_btree_value_type *vt, 7083241b1d3SJoe Thornber uint64_t key, unsigned *index) 7093241b1d3SJoe Thornber { 7103241b1d3SJoe Thornber int r, i = *index, top = 1; 711550929faSMikulas Patocka struct btree_node *node; 7123241b1d3SJoe Thornber 7133241b1d3SJoe Thornber for (;;) { 7143241b1d3SJoe Thornber r = shadow_step(s, root, vt); 7153241b1d3SJoe Thornber if (r < 0) 7163241b1d3SJoe Thornber return r; 7173241b1d3SJoe Thornber 7183241b1d3SJoe Thornber node = dm_block_data(shadow_current(s)); 7193241b1d3SJoe Thornber 7203241b1d3SJoe Thornber /* 7213241b1d3SJoe Thornber * We have to patch up the parent node, ugly, but I don't 7223241b1d3SJoe Thornber * see a way to do this automatically as part of the spine 7233241b1d3SJoe Thornber * op. 7243241b1d3SJoe Thornber */ 7253241b1d3SJoe Thornber if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */ 7263241b1d3SJoe Thornber __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 7273241b1d3SJoe Thornber 7283241b1d3SJoe Thornber __dm_bless_for_disk(&location); 729a3aefb39SJoe Thornber memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), 7303241b1d3SJoe Thornber &location, sizeof(__le64)); 7313241b1d3SJoe Thornber } 7323241b1d3SJoe Thornber 7333241b1d3SJoe Thornber node = dm_block_data(shadow_current(s)); 7343241b1d3SJoe Thornber 7353241b1d3SJoe Thornber if (node->header.nr_entries == node->header.max_entries) { 7363241b1d3SJoe Thornber if (top) 7373241b1d3SJoe Thornber r = btree_split_beneath(s, key); 7383241b1d3SJoe Thornber else 7390a8d4c3eSVivek Goyal r = btree_split_sibling(s, i, key); 7403241b1d3SJoe Thornber 7413241b1d3SJoe Thornber if (r < 0) 7423241b1d3SJoe Thornber return r; 7433241b1d3SJoe Thornber } 7443241b1d3SJoe Thornber 7453241b1d3SJoe Thornber node = dm_block_data(shadow_current(s)); 7463241b1d3SJoe Thornber 7473241b1d3SJoe Thornber i = lower_bound(node, key); 7483241b1d3SJoe Thornber 7493241b1d3SJoe Thornber if (le32_to_cpu(node->header.flags) & LEAF_NODE) 7503241b1d3SJoe Thornber break; 7513241b1d3SJoe Thornber 7523241b1d3SJoe Thornber if (i < 0) { 7533241b1d3SJoe Thornber /* change the bounds on the lowest key */ 7543241b1d3SJoe Thornber node->keys[0] = cpu_to_le64(key); 7553241b1d3SJoe Thornber i = 0; 7563241b1d3SJoe Thornber } 7573241b1d3SJoe Thornber 7583241b1d3SJoe Thornber root = value64(node, i); 7593241b1d3SJoe Thornber top = 0; 7603241b1d3SJoe Thornber } 7613241b1d3SJoe Thornber 7623241b1d3SJoe Thornber if (i < 0 || le64_to_cpu(node->keys[i]) != key) 7633241b1d3SJoe Thornber i++; 7643241b1d3SJoe Thornber 7653241b1d3SJoe Thornber *index = i; 7663241b1d3SJoe Thornber return 0; 7673241b1d3SJoe Thornber } 7683241b1d3SJoe Thornber 769ba503835SMike Snitzer static bool need_insert(struct btree_node *node, uint64_t *keys, 770ba503835SMike Snitzer unsigned level, unsigned index) 771ba503835SMike Snitzer { 772ba503835SMike Snitzer return ((index >= le32_to_cpu(node->header.nr_entries)) || 773ba503835SMike Snitzer (le64_to_cpu(node->keys[index]) != keys[level])); 774ba503835SMike Snitzer } 775ba503835SMike Snitzer 7763241b1d3SJoe Thornber static int insert(struct dm_btree_info *info, dm_block_t root, 7773241b1d3SJoe Thornber uint64_t *keys, void *value, dm_block_t *new_root, 7783241b1d3SJoe Thornber int *inserted) 7793241b1d3SJoe Thornber __dm_written_to_disk(value) 7803241b1d3SJoe Thornber { 781ba503835SMike Snitzer int r; 7823241b1d3SJoe Thornber unsigned level, index = -1, last_level = info->levels - 1; 7833241b1d3SJoe Thornber dm_block_t block = root; 7843241b1d3SJoe Thornber struct shadow_spine spine; 785550929faSMikulas Patocka struct btree_node *n; 7863241b1d3SJoe Thornber struct dm_btree_value_type le64_type; 7873241b1d3SJoe Thornber 788b0dc3c8bSJoe Thornber init_le64_type(info->tm, &le64_type); 7893241b1d3SJoe Thornber init_shadow_spine(&spine, info); 7903241b1d3SJoe Thornber 7913241b1d3SJoe Thornber for (level = 0; level < (info->levels - 1); level++) { 7923241b1d3SJoe Thornber r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index); 7933241b1d3SJoe Thornber if (r < 0) 7943241b1d3SJoe Thornber goto bad; 7953241b1d3SJoe Thornber 7963241b1d3SJoe Thornber n = dm_block_data(shadow_current(&spine)); 7973241b1d3SJoe Thornber 798ba503835SMike Snitzer if (need_insert(n, keys, level, index)) { 7993241b1d3SJoe Thornber dm_block_t new_tree; 8003241b1d3SJoe Thornber __le64 new_le; 8013241b1d3SJoe Thornber 8023241b1d3SJoe Thornber r = dm_btree_empty(info, &new_tree); 8033241b1d3SJoe Thornber if (r < 0) 8043241b1d3SJoe Thornber goto bad; 8053241b1d3SJoe Thornber 8063241b1d3SJoe Thornber new_le = cpu_to_le64(new_tree); 8073241b1d3SJoe Thornber __dm_bless_for_disk(&new_le); 8083241b1d3SJoe Thornber 8093241b1d3SJoe Thornber r = insert_at(sizeof(uint64_t), n, index, 8103241b1d3SJoe Thornber keys[level], &new_le); 8113241b1d3SJoe Thornber if (r) 8123241b1d3SJoe Thornber goto bad; 8133241b1d3SJoe Thornber } 8143241b1d3SJoe Thornber 8153241b1d3SJoe Thornber if (level < last_level) 8163241b1d3SJoe Thornber block = value64(n, index); 8173241b1d3SJoe Thornber } 8183241b1d3SJoe Thornber 8193241b1d3SJoe Thornber r = btree_insert_raw(&spine, block, &info->value_type, 8203241b1d3SJoe Thornber keys[level], &index); 8213241b1d3SJoe Thornber if (r < 0) 8223241b1d3SJoe Thornber goto bad; 8233241b1d3SJoe Thornber 8243241b1d3SJoe Thornber n = dm_block_data(shadow_current(&spine)); 8253241b1d3SJoe Thornber 826ba503835SMike Snitzer if (need_insert(n, keys, level, index)) { 8273241b1d3SJoe Thornber if (inserted) 8283241b1d3SJoe Thornber *inserted = 1; 8293241b1d3SJoe Thornber 8303241b1d3SJoe Thornber r = insert_at(info->value_type.size, n, index, 8313241b1d3SJoe Thornber keys[level], value); 8323241b1d3SJoe Thornber if (r) 8333241b1d3SJoe Thornber goto bad_unblessed; 8343241b1d3SJoe Thornber } else { 8353241b1d3SJoe Thornber if (inserted) 8363241b1d3SJoe Thornber *inserted = 0; 8373241b1d3SJoe Thornber 8383241b1d3SJoe Thornber if (info->value_type.dec && 8393241b1d3SJoe Thornber (!info->value_type.equal || 8403241b1d3SJoe Thornber !info->value_type.equal( 8413241b1d3SJoe Thornber info->value_type.context, 842a3aefb39SJoe Thornber value_ptr(n, index), 8433241b1d3SJoe Thornber value))) { 8443241b1d3SJoe Thornber info->value_type.dec(info->value_type.context, 845a3aefb39SJoe Thornber value_ptr(n, index)); 8463241b1d3SJoe Thornber } 847a3aefb39SJoe Thornber memcpy_disk(value_ptr(n, index), 8483241b1d3SJoe Thornber value, info->value_type.size); 8493241b1d3SJoe Thornber } 8503241b1d3SJoe Thornber 8513241b1d3SJoe Thornber *new_root = shadow_root(&spine); 8523241b1d3SJoe Thornber exit_shadow_spine(&spine); 8533241b1d3SJoe Thornber 8543241b1d3SJoe Thornber return 0; 8553241b1d3SJoe Thornber 8563241b1d3SJoe Thornber bad: 8573241b1d3SJoe Thornber __dm_unbless_for_disk(value); 8583241b1d3SJoe Thornber bad_unblessed: 8593241b1d3SJoe Thornber exit_shadow_spine(&spine); 8603241b1d3SJoe Thornber return r; 8613241b1d3SJoe Thornber } 8623241b1d3SJoe Thornber 8633241b1d3SJoe Thornber int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, 8643241b1d3SJoe Thornber uint64_t *keys, void *value, dm_block_t *new_root) 8653241b1d3SJoe Thornber __dm_written_to_disk(value) 8663241b1d3SJoe Thornber { 8673241b1d3SJoe Thornber return insert(info, root, keys, value, new_root, NULL); 8683241b1d3SJoe Thornber } 8693241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_insert); 8703241b1d3SJoe Thornber 8713241b1d3SJoe Thornber int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, 8723241b1d3SJoe Thornber uint64_t *keys, void *value, dm_block_t *new_root, 8733241b1d3SJoe Thornber int *inserted) 8743241b1d3SJoe Thornber __dm_written_to_disk(value) 8753241b1d3SJoe Thornber { 8763241b1d3SJoe Thornber return insert(info, root, keys, value, new_root, inserted); 8773241b1d3SJoe Thornber } 8783241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_insert_notify); 8793241b1d3SJoe Thornber 8803241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 8813241b1d3SJoe Thornber 882f164e690SJoe Thornber static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, 8833241b1d3SJoe Thornber uint64_t *result_key, dm_block_t *next_block) 8843241b1d3SJoe Thornber { 8853241b1d3SJoe Thornber int i, r; 8863241b1d3SJoe Thornber uint32_t flags; 8873241b1d3SJoe Thornber 8883241b1d3SJoe Thornber do { 8893241b1d3SJoe Thornber r = ro_step(s, block); 8903241b1d3SJoe Thornber if (r < 0) 8913241b1d3SJoe Thornber return r; 8923241b1d3SJoe Thornber 8933241b1d3SJoe Thornber flags = le32_to_cpu(ro_node(s)->header.flags); 8943241b1d3SJoe Thornber i = le32_to_cpu(ro_node(s)->header.nr_entries); 8953241b1d3SJoe Thornber if (!i) 8963241b1d3SJoe Thornber return -ENODATA; 8973241b1d3SJoe Thornber else 8983241b1d3SJoe Thornber i--; 8993241b1d3SJoe Thornber 900f164e690SJoe Thornber if (find_highest) 9013241b1d3SJoe Thornber *result_key = le64_to_cpu(ro_node(s)->keys[i]); 902f164e690SJoe Thornber else 903f164e690SJoe Thornber *result_key = le64_to_cpu(ro_node(s)->keys[0]); 904f164e690SJoe Thornber 905*7d1fedb6SVinothkumar Raja if (next_block || flags & INTERNAL_NODE) { 906*7d1fedb6SVinothkumar Raja if (find_highest) 9073241b1d3SJoe Thornber block = value64(ro_node(s), i); 908*7d1fedb6SVinothkumar Raja else 909*7d1fedb6SVinothkumar Raja block = value64(ro_node(s), 0); 910*7d1fedb6SVinothkumar Raja } 9113241b1d3SJoe Thornber 9123241b1d3SJoe Thornber } while (flags & INTERNAL_NODE); 9133241b1d3SJoe Thornber 9143241b1d3SJoe Thornber if (next_block) 9153241b1d3SJoe Thornber *next_block = block; 9163241b1d3SJoe Thornber return 0; 9173241b1d3SJoe Thornber } 9183241b1d3SJoe Thornber 919f164e690SJoe Thornber static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, 920f164e690SJoe Thornber bool find_highest, uint64_t *result_keys) 9213241b1d3SJoe Thornber { 9223241b1d3SJoe Thornber int r = 0, count = 0, level; 9233241b1d3SJoe Thornber struct ro_spine spine; 9243241b1d3SJoe Thornber 9253241b1d3SJoe Thornber init_ro_spine(&spine, info); 9263241b1d3SJoe Thornber for (level = 0; level < info->levels; level++) { 927f164e690SJoe Thornber r = find_key(&spine, root, find_highest, result_keys + level, 9283241b1d3SJoe Thornber level == info->levels - 1 ? NULL : &root); 9293241b1d3SJoe Thornber if (r == -ENODATA) { 9303241b1d3SJoe Thornber r = 0; 9313241b1d3SJoe Thornber break; 9323241b1d3SJoe Thornber 9333241b1d3SJoe Thornber } else if (r) 9343241b1d3SJoe Thornber break; 9353241b1d3SJoe Thornber 9363241b1d3SJoe Thornber count++; 9373241b1d3SJoe Thornber } 9383241b1d3SJoe Thornber exit_ro_spine(&spine); 9393241b1d3SJoe Thornber 9403241b1d3SJoe Thornber return r ? r : count; 9413241b1d3SJoe Thornber } 942f164e690SJoe Thornber 943f164e690SJoe Thornber int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 944f164e690SJoe Thornber uint64_t *result_keys) 945f164e690SJoe Thornber { 946f164e690SJoe Thornber return dm_btree_find_key(info, root, true, result_keys); 947f164e690SJoe Thornber } 9483241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); 9494e7f1f90SJoe Thornber 950f164e690SJoe Thornber int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, 951f164e690SJoe Thornber uint64_t *result_keys) 952f164e690SJoe Thornber { 953f164e690SJoe Thornber return dm_btree_find_key(info, root, false, result_keys); 954f164e690SJoe Thornber } 955f164e690SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); 956f164e690SJoe Thornber 957f164e690SJoe Thornber /*----------------------------------------------------------------*/ 958f164e690SJoe Thornber 9594e7f1f90SJoe Thornber /* 9604e7f1f90SJoe Thornber * FIXME: We shouldn't use a recursive algorithm when we have limited stack 9614e7f1f90SJoe Thornber * space. Also this only works for single level trees. 9624e7f1f90SJoe Thornber */ 9639b460d36SJoe Thornber static int walk_node(struct dm_btree_info *info, dm_block_t block, 9644e7f1f90SJoe Thornber int (*fn)(void *context, uint64_t *keys, void *leaf), 9654e7f1f90SJoe Thornber void *context) 9664e7f1f90SJoe Thornber { 9674e7f1f90SJoe Thornber int r; 9684e7f1f90SJoe Thornber unsigned i, nr; 9699b460d36SJoe Thornber struct dm_block *node; 9704e7f1f90SJoe Thornber struct btree_node *n; 9714e7f1f90SJoe Thornber uint64_t keys; 9724e7f1f90SJoe Thornber 9739b460d36SJoe Thornber r = bn_read_lock(info, block, &node); 9749b460d36SJoe Thornber if (r) 9759b460d36SJoe Thornber return r; 9769b460d36SJoe Thornber 9779b460d36SJoe Thornber n = dm_block_data(node); 9784e7f1f90SJoe Thornber 9794e7f1f90SJoe Thornber nr = le32_to_cpu(n->header.nr_entries); 9804e7f1f90SJoe Thornber for (i = 0; i < nr; i++) { 9814e7f1f90SJoe Thornber if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { 9829b460d36SJoe Thornber r = walk_node(info, value64(n, i), fn, context); 9834e7f1f90SJoe Thornber if (r) 9844e7f1f90SJoe Thornber goto out; 9854e7f1f90SJoe Thornber } else { 9864e7f1f90SJoe Thornber keys = le64_to_cpu(*key_ptr(n, i)); 9874e7f1f90SJoe Thornber r = fn(context, &keys, value_ptr(n, i)); 9884e7f1f90SJoe Thornber if (r) 9894e7f1f90SJoe Thornber goto out; 9904e7f1f90SJoe Thornber } 9914e7f1f90SJoe Thornber } 9924e7f1f90SJoe Thornber 9934e7f1f90SJoe Thornber out: 9949b460d36SJoe Thornber dm_tm_unlock(info->tm, node); 9954e7f1f90SJoe Thornber return r; 9964e7f1f90SJoe Thornber } 9974e7f1f90SJoe Thornber 9984e7f1f90SJoe Thornber int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, 9994e7f1f90SJoe Thornber int (*fn)(void *context, uint64_t *keys, void *leaf), 10004e7f1f90SJoe Thornber void *context) 10014e7f1f90SJoe Thornber { 10024e7f1f90SJoe Thornber BUG_ON(info->levels > 1); 10039b460d36SJoe Thornber return walk_node(info, root, fn, context); 10044e7f1f90SJoe Thornber } 10054e7f1f90SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_walk); 10067d111c81SJoe Thornber 10077d111c81SJoe Thornber /*----------------------------------------------------------------*/ 10087d111c81SJoe Thornber 10097d111c81SJoe Thornber static void prefetch_values(struct dm_btree_cursor *c) 10107d111c81SJoe Thornber { 10117d111c81SJoe Thornber unsigned i, nr; 10127d111c81SJoe Thornber __le64 value_le; 10137d111c81SJoe Thornber struct cursor_node *n = c->nodes + c->depth - 1; 10147d111c81SJoe Thornber struct btree_node *bn = dm_block_data(n->b); 10157d111c81SJoe Thornber struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm); 10167d111c81SJoe Thornber 10177d111c81SJoe Thornber BUG_ON(c->info->value_type.size != sizeof(value_le)); 10187d111c81SJoe Thornber 10197d111c81SJoe Thornber nr = le32_to_cpu(bn->header.nr_entries); 10207d111c81SJoe Thornber for (i = 0; i < nr; i++) { 10217d111c81SJoe Thornber memcpy(&value_le, value_ptr(bn, i), sizeof(value_le)); 10227d111c81SJoe Thornber dm_bm_prefetch(bm, le64_to_cpu(value_le)); 10237d111c81SJoe Thornber } 10247d111c81SJoe Thornber } 10257d111c81SJoe Thornber 10267d111c81SJoe Thornber static bool leaf_node(struct dm_btree_cursor *c) 10277d111c81SJoe Thornber { 10287d111c81SJoe Thornber struct cursor_node *n = c->nodes + c->depth - 1; 10297d111c81SJoe Thornber struct btree_node *bn = dm_block_data(n->b); 10307d111c81SJoe Thornber 10317d111c81SJoe Thornber return le32_to_cpu(bn->header.flags) & LEAF_NODE; 10327d111c81SJoe Thornber } 10337d111c81SJoe Thornber 10347d111c81SJoe Thornber static int push_node(struct dm_btree_cursor *c, dm_block_t b) 10357d111c81SJoe Thornber { 10367d111c81SJoe Thornber int r; 10377d111c81SJoe Thornber struct cursor_node *n = c->nodes + c->depth; 10387d111c81SJoe Thornber 10397d111c81SJoe Thornber if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) { 10407d111c81SJoe Thornber DMERR("couldn't push cursor node, stack depth too high"); 10417d111c81SJoe Thornber return -EINVAL; 10427d111c81SJoe Thornber } 10437d111c81SJoe Thornber 10447d111c81SJoe Thornber r = bn_read_lock(c->info, b, &n->b); 10457d111c81SJoe Thornber if (r) 10467d111c81SJoe Thornber return r; 10477d111c81SJoe Thornber 10487d111c81SJoe Thornber n->index = 0; 10497d111c81SJoe Thornber c->depth++; 10507d111c81SJoe Thornber 10517d111c81SJoe Thornber if (c->prefetch_leaves || !leaf_node(c)) 10527d111c81SJoe Thornber prefetch_values(c); 10537d111c81SJoe Thornber 10547d111c81SJoe Thornber return 0; 10557d111c81SJoe Thornber } 10567d111c81SJoe Thornber 10577d111c81SJoe Thornber static void pop_node(struct dm_btree_cursor *c) 10587d111c81SJoe Thornber { 10597d111c81SJoe Thornber c->depth--; 10607d111c81SJoe Thornber unlock_block(c->info, c->nodes[c->depth].b); 10617d111c81SJoe Thornber } 10627d111c81SJoe Thornber 10637d111c81SJoe Thornber static int inc_or_backtrack(struct dm_btree_cursor *c) 10647d111c81SJoe Thornber { 10657d111c81SJoe Thornber struct cursor_node *n; 10667d111c81SJoe Thornber struct btree_node *bn; 10677d111c81SJoe Thornber 10687d111c81SJoe Thornber for (;;) { 10697d111c81SJoe Thornber if (!c->depth) 10707d111c81SJoe Thornber return -ENODATA; 10717d111c81SJoe Thornber 10727d111c81SJoe Thornber n = c->nodes + c->depth - 1; 10737d111c81SJoe Thornber bn = dm_block_data(n->b); 10747d111c81SJoe Thornber 10757d111c81SJoe Thornber n->index++; 10767d111c81SJoe Thornber if (n->index < le32_to_cpu(bn->header.nr_entries)) 10777d111c81SJoe Thornber break; 10787d111c81SJoe Thornber 10797d111c81SJoe Thornber pop_node(c); 10807d111c81SJoe Thornber } 10817d111c81SJoe Thornber 10827d111c81SJoe Thornber return 0; 10837d111c81SJoe Thornber } 10847d111c81SJoe Thornber 10857d111c81SJoe Thornber static int find_leaf(struct dm_btree_cursor *c) 10867d111c81SJoe Thornber { 10877d111c81SJoe Thornber int r = 0; 10887d111c81SJoe Thornber struct cursor_node *n; 10897d111c81SJoe Thornber struct btree_node *bn; 10907d111c81SJoe Thornber __le64 value_le; 10917d111c81SJoe Thornber 10927d111c81SJoe Thornber for (;;) { 10937d111c81SJoe Thornber n = c->nodes + c->depth - 1; 10947d111c81SJoe Thornber bn = dm_block_data(n->b); 10957d111c81SJoe Thornber 10967d111c81SJoe Thornber if (le32_to_cpu(bn->header.flags) & LEAF_NODE) 10977d111c81SJoe Thornber break; 10987d111c81SJoe Thornber 10997d111c81SJoe Thornber memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le)); 11007d111c81SJoe Thornber r = push_node(c, le64_to_cpu(value_le)); 11017d111c81SJoe Thornber if (r) { 11027d111c81SJoe Thornber DMERR("push_node failed"); 11037d111c81SJoe Thornber break; 11047d111c81SJoe Thornber } 11057d111c81SJoe Thornber } 11067d111c81SJoe Thornber 11077d111c81SJoe Thornber if (!r && (le32_to_cpu(bn->header.nr_entries) == 0)) 11087d111c81SJoe Thornber return -ENODATA; 11097d111c81SJoe Thornber 11107d111c81SJoe Thornber return r; 11117d111c81SJoe Thornber } 11127d111c81SJoe Thornber 11137d111c81SJoe Thornber int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, 11147d111c81SJoe Thornber bool prefetch_leaves, struct dm_btree_cursor *c) 11157d111c81SJoe Thornber { 11167d111c81SJoe Thornber int r; 11177d111c81SJoe Thornber 11187d111c81SJoe Thornber c->info = info; 11197d111c81SJoe Thornber c->root = root; 11207d111c81SJoe Thornber c->depth = 0; 11217d111c81SJoe Thornber c->prefetch_leaves = prefetch_leaves; 11227d111c81SJoe Thornber 11237d111c81SJoe Thornber r = push_node(c, root); 11247d111c81SJoe Thornber if (r) 11257d111c81SJoe Thornber return r; 11267d111c81SJoe Thornber 11277d111c81SJoe Thornber return find_leaf(c); 11287d111c81SJoe Thornber } 11297d111c81SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_cursor_begin); 11307d111c81SJoe Thornber 11317d111c81SJoe Thornber void dm_btree_cursor_end(struct dm_btree_cursor *c) 11327d111c81SJoe Thornber { 11337d111c81SJoe Thornber while (c->depth) 11347d111c81SJoe Thornber pop_node(c); 11357d111c81SJoe Thornber } 11367d111c81SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_cursor_end); 11377d111c81SJoe Thornber 11387d111c81SJoe Thornber int dm_btree_cursor_next(struct dm_btree_cursor *c) 11397d111c81SJoe Thornber { 11407d111c81SJoe Thornber int r = inc_or_backtrack(c); 11417d111c81SJoe Thornber if (!r) { 11427d111c81SJoe Thornber r = find_leaf(c); 11437d111c81SJoe Thornber if (r) 11447d111c81SJoe Thornber DMERR("find_leaf failed"); 11457d111c81SJoe Thornber } 11467d111c81SJoe Thornber 11477d111c81SJoe Thornber return r; 11487d111c81SJoe Thornber } 11497d111c81SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_cursor_next); 11507d111c81SJoe Thornber 11519b696229SJoe Thornber int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count) 11529b696229SJoe Thornber { 11539b696229SJoe Thornber int r = 0; 11549b696229SJoe Thornber 11559b696229SJoe Thornber while (count-- && !r) 11569b696229SJoe Thornber r = dm_btree_cursor_next(c); 11579b696229SJoe Thornber 11589b696229SJoe Thornber return r; 11599b696229SJoe Thornber } 11609b696229SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_cursor_skip); 11619b696229SJoe Thornber 11627d111c81SJoe Thornber int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le) 11637d111c81SJoe Thornber { 11647d111c81SJoe Thornber if (c->depth) { 11657d111c81SJoe Thornber struct cursor_node *n = c->nodes + c->depth - 1; 11667d111c81SJoe Thornber struct btree_node *bn = dm_block_data(n->b); 11677d111c81SJoe Thornber 11687d111c81SJoe Thornber if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE) 11697d111c81SJoe Thornber return -EINVAL; 11707d111c81SJoe Thornber 11717d111c81SJoe Thornber *key = le64_to_cpu(*key_ptr(bn, n->index)); 11727d111c81SJoe Thornber memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size); 11737d111c81SJoe Thornber return 0; 11747d111c81SJoe Thornber 11757d111c81SJoe Thornber } else 11767d111c81SJoe Thornber return -ENODATA; 11777d111c81SJoe Thornber } 11787d111c81SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value); 1179