13241b1d3SJoe Thornber /* 23241b1d3SJoe Thornber * Copyright (C) 2011 Red Hat, Inc. 33241b1d3SJoe Thornber * 43241b1d3SJoe Thornber * This file is released under the GPL. 53241b1d3SJoe Thornber */ 63241b1d3SJoe Thornber #ifndef _LINUX_DM_BTREE_H 73241b1d3SJoe Thornber #define _LINUX_DM_BTREE_H 83241b1d3SJoe Thornber 93241b1d3SJoe Thornber #include "dm-block-manager.h" 103241b1d3SJoe Thornber 113241b1d3SJoe Thornber struct dm_transaction_manager; 123241b1d3SJoe Thornber 133241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 143241b1d3SJoe Thornber 153241b1d3SJoe Thornber /* 163241b1d3SJoe Thornber * Annotations used to check on-disk metadata is handled as little-endian. 173241b1d3SJoe Thornber */ 183241b1d3SJoe Thornber #ifdef __CHECKER__ 193241b1d3SJoe Thornber # define __dm_written_to_disk(x) __releases(x) 203241b1d3SJoe Thornber # define __dm_reads_from_disk(x) __acquires(x) 213241b1d3SJoe Thornber # define __dm_bless_for_disk(x) __acquire(x) 223241b1d3SJoe Thornber # define __dm_unbless_for_disk(x) __release(x) 233241b1d3SJoe Thornber #else 243241b1d3SJoe Thornber # define __dm_written_to_disk(x) 253241b1d3SJoe Thornber # define __dm_reads_from_disk(x) 263241b1d3SJoe Thornber # define __dm_bless_for_disk(x) 273241b1d3SJoe Thornber # define __dm_unbless_for_disk(x) 283241b1d3SJoe Thornber #endif 293241b1d3SJoe Thornber 303241b1d3SJoe Thornber /*----------------------------------------------------------------*/ 313241b1d3SJoe Thornber 323241b1d3SJoe Thornber /* 333241b1d3SJoe Thornber * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized 343241b1d3SJoe Thornber * values. 353241b1d3SJoe Thornber */ 363241b1d3SJoe Thornber 373241b1d3SJoe Thornber /* 3883f0d77aSMasanari Iida * Information about the values stored within the btree. 393241b1d3SJoe Thornber */ 403241b1d3SJoe Thornber struct dm_btree_value_type { 413241b1d3SJoe Thornber void *context; 423241b1d3SJoe Thornber 433241b1d3SJoe Thornber /* 443241b1d3SJoe Thornber * The size in bytes of each value. 453241b1d3SJoe Thornber */ 463241b1d3SJoe Thornber uint32_t size; 473241b1d3SJoe Thornber 483241b1d3SJoe Thornber /* 493241b1d3SJoe Thornber * Any of these methods can be safely set to NULL if you do not 503241b1d3SJoe Thornber * need the corresponding feature. 513241b1d3SJoe Thornber */ 523241b1d3SJoe Thornber 533241b1d3SJoe Thornber /* 54*be500ed7SJoe Thornber * The btree is making a duplicate of a run of values, for instance 553241b1d3SJoe Thornber * because previously-shared btree nodes have now diverged. 563241b1d3SJoe Thornber * @value argument is the new copy that the copy function may modify. 573241b1d3SJoe Thornber * (Probably it just wants to increment a reference count 583241b1d3SJoe Thornber * somewhere.) This method is _not_ called for insertion of a new 593241b1d3SJoe Thornber * value: It is assumed the ref count is already 1. 603241b1d3SJoe Thornber */ 61*be500ed7SJoe Thornber void (*inc)(void *context, const void *value, unsigned count); 623241b1d3SJoe Thornber 633241b1d3SJoe Thornber /* 64*be500ed7SJoe Thornber * These values are being deleted. The btree takes care of freeing 653241b1d3SJoe Thornber * the memory pointed to by @value. Often the del function just 66*be500ed7SJoe Thornber * needs to decrement a reference counts somewhere. 673241b1d3SJoe Thornber */ 68*be500ed7SJoe Thornber void (*dec)(void *context, const void *value, unsigned count); 693241b1d3SJoe Thornber 703241b1d3SJoe Thornber /* 713241b1d3SJoe Thornber * A test for equality between two values. When a value is 723241b1d3SJoe Thornber * overwritten with a new one, the old one has the dec method 733241b1d3SJoe Thornber * called _unless_ the new and old value are deemed equal. 743241b1d3SJoe Thornber */ 75018cede9SMike Snitzer int (*equal)(void *context, const void *value1, const void *value2); 763241b1d3SJoe Thornber }; 773241b1d3SJoe Thornber 783241b1d3SJoe Thornber /* 793241b1d3SJoe Thornber * The shape and contents of a btree. 803241b1d3SJoe Thornber */ 813241b1d3SJoe Thornber struct dm_btree_info { 823241b1d3SJoe Thornber struct dm_transaction_manager *tm; 833241b1d3SJoe Thornber 843241b1d3SJoe Thornber /* 853241b1d3SJoe Thornber * Number of nested btrees. (Not the depth of a single tree.) 863241b1d3SJoe Thornber */ 873241b1d3SJoe Thornber unsigned levels; 883241b1d3SJoe Thornber struct dm_btree_value_type value_type; 893241b1d3SJoe Thornber }; 903241b1d3SJoe Thornber 913241b1d3SJoe Thornber /* 923241b1d3SJoe Thornber * Set up an empty tree. O(1). 933241b1d3SJoe Thornber */ 943241b1d3SJoe Thornber int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); 953241b1d3SJoe Thornber 963241b1d3SJoe Thornber /* 973241b1d3SJoe Thornber * Delete a tree. O(n) - this is the slow one! It can also block, so 983241b1d3SJoe Thornber * please don't call it on an IO path. 993241b1d3SJoe Thornber */ 1003241b1d3SJoe Thornber int dm_btree_del(struct dm_btree_info *info, dm_block_t root); 1013241b1d3SJoe Thornber 1023241b1d3SJoe Thornber /* 1033241b1d3SJoe Thornber * All the lookup functions return -ENODATA if the key cannot be found. 1043241b1d3SJoe Thornber */ 1053241b1d3SJoe Thornber 1063241b1d3SJoe Thornber /* 1073241b1d3SJoe Thornber * Tries to find a key that matches exactly. O(ln(n)) 1083241b1d3SJoe Thornber */ 1093241b1d3SJoe Thornber int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, 1103241b1d3SJoe Thornber uint64_t *keys, void *value_le); 1113241b1d3SJoe Thornber 1123241b1d3SJoe Thornber /* 113993ceab9SJoe Thornber * Tries to find the first key where the bottom level key is >= to that 114993ceab9SJoe Thornber * given. Useful for skipping empty sections of the btree. 115993ceab9SJoe Thornber */ 116993ceab9SJoe Thornber int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, 117993ceab9SJoe Thornber uint64_t *keys, uint64_t *rkey, void *value_le); 118993ceab9SJoe Thornber 119993ceab9SJoe Thornber /* 1203241b1d3SJoe Thornber * Insertion (or overwrite an existing value). O(ln(n)) 1213241b1d3SJoe Thornber */ 1223241b1d3SJoe Thornber int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, 1233241b1d3SJoe Thornber uint64_t *keys, void *value, dm_block_t *new_root) 1243241b1d3SJoe Thornber __dm_written_to_disk(value); 1253241b1d3SJoe Thornber 1263241b1d3SJoe Thornber /* 1273241b1d3SJoe Thornber * A variant of insert that indicates whether it actually inserted or just 1283241b1d3SJoe Thornber * overwrote. Useful if you're keeping track of the number of entries in a 1293241b1d3SJoe Thornber * tree. 1303241b1d3SJoe Thornber */ 1313241b1d3SJoe Thornber int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, 1323241b1d3SJoe Thornber uint64_t *keys, void *value, dm_block_t *new_root, 1333241b1d3SJoe Thornber int *inserted) 1343241b1d3SJoe Thornber __dm_written_to_disk(value); 1353241b1d3SJoe Thornber 1363241b1d3SJoe Thornber /* 1373241b1d3SJoe Thornber * Remove a key if present. This doesn't remove empty sub trees. Normally 1383241b1d3SJoe Thornber * subtrees represent a separate entity, like a snapshot map, so this is 1393241b1d3SJoe Thornber * correct behaviour. O(ln(n)). 1403241b1d3SJoe Thornber */ 1413241b1d3SJoe Thornber int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, 1423241b1d3SJoe Thornber uint64_t *keys, dm_block_t *new_root); 1433241b1d3SJoe Thornber 1443241b1d3SJoe Thornber /* 145993ceab9SJoe Thornber * Removes a _contiguous_ run of values starting from 'keys' and not 146993ceab9SJoe Thornber * reaching keys2 (where keys2 is keys with the final key replaced with 147993ceab9SJoe Thornber * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be 148993ceab9SJoe Thornber * altered. 1494ec331c3SJoe Thornber */ 1504ec331c3SJoe Thornber int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, 1514ec331c3SJoe Thornber uint64_t *keys, uint64_t end_key, 1524ec331c3SJoe Thornber dm_block_t *new_root, unsigned *nr_removed); 1534ec331c3SJoe Thornber 1544ec331c3SJoe Thornber /* 1553241b1d3SJoe Thornber * Returns < 0 on failure. Otherwise the number of key entries that have 1563241b1d3SJoe Thornber * been filled out. Remember trees can have zero entries, and as such have 157f164e690SJoe Thornber * no lowest key. 158f164e690SJoe Thornber */ 159f164e690SJoe Thornber int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, 160f164e690SJoe Thornber uint64_t *result_keys); 161f164e690SJoe Thornber 162f164e690SJoe Thornber /* 163f164e690SJoe Thornber * Returns < 0 on failure. Otherwise the number of key entries that have 164f164e690SJoe Thornber * been filled out. Remember trees can have zero entries, and as such have 1653241b1d3SJoe Thornber * no highest key. 1663241b1d3SJoe Thornber */ 1673241b1d3SJoe Thornber int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 1683241b1d3SJoe Thornber uint64_t *result_keys); 1693241b1d3SJoe Thornber 1704e7f1f90SJoe Thornber /* 1714e7f1f90SJoe Thornber * Iterate through the a btree, calling fn() on each entry. 1724e7f1f90SJoe Thornber * It only works for single level trees and is internally recursive, so 1734e7f1f90SJoe Thornber * monitor stack usage carefully. 1744e7f1f90SJoe Thornber */ 1754e7f1f90SJoe Thornber int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, 1764e7f1f90SJoe Thornber int (*fn)(void *context, uint64_t *keys, void *leaf), 1774e7f1f90SJoe Thornber void *context); 1784e7f1f90SJoe Thornber 1797d111c81SJoe Thornber 1807d111c81SJoe Thornber /*----------------------------------------------------------------*/ 1817d111c81SJoe Thornber 1827d111c81SJoe Thornber /* 1837d111c81SJoe Thornber * Cursor API. This does not follow the rolling lock convention. Since we 1847d111c81SJoe Thornber * know the order that values are required we can issue prefetches to speed 1857d111c81SJoe Thornber * up iteration. Use on a single level btree only. 1867d111c81SJoe Thornber */ 1877d111c81SJoe Thornber #define DM_BTREE_CURSOR_MAX_DEPTH 16 1887d111c81SJoe Thornber 1897d111c81SJoe Thornber struct cursor_node { 1907d111c81SJoe Thornber struct dm_block *b; 1917d111c81SJoe Thornber unsigned index; 1927d111c81SJoe Thornber }; 1937d111c81SJoe Thornber 1947d111c81SJoe Thornber struct dm_btree_cursor { 1957d111c81SJoe Thornber struct dm_btree_info *info; 1967d111c81SJoe Thornber dm_block_t root; 1977d111c81SJoe Thornber 1987d111c81SJoe Thornber bool prefetch_leaves; 1997d111c81SJoe Thornber unsigned depth; 2007d111c81SJoe Thornber struct cursor_node nodes[DM_BTREE_CURSOR_MAX_DEPTH]; 2017d111c81SJoe Thornber }; 2027d111c81SJoe Thornber 2037d111c81SJoe Thornber /* 2047d111c81SJoe Thornber * Creates a fresh cursor. If prefetch_leaves is set then it is assumed 2057d111c81SJoe Thornber * the btree contains block indexes that will be prefetched. The cursor is 2067d111c81SJoe Thornber * quite large, so you probably don't want to put it on the stack. 2077d111c81SJoe Thornber */ 2087d111c81SJoe Thornber int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, 2097d111c81SJoe Thornber bool prefetch_leaves, struct dm_btree_cursor *c); 2107d111c81SJoe Thornber void dm_btree_cursor_end(struct dm_btree_cursor *c); 2117d111c81SJoe Thornber int dm_btree_cursor_next(struct dm_btree_cursor *c); 2129b696229SJoe Thornber int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count); 2137d111c81SJoe Thornber int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); 2147d111c81SJoe Thornber 2153241b1d3SJoe Thornber #endif /* _LINUX_DM_BTREE_H */ 216