xref: /linux/drivers/md/dm-vdo/indexer/delta-index.c (revision a5f998094fa344cdd1342164948abb4d7c6101ce)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2023 Red Hat
4  */
5 #include "delta-index.h"
6 
7 #include <linux/bitops.h>
8 #include <linux/bits.h>
9 #include <linux/compiler.h>
10 #include <linux/limits.h>
11 #include <linux/log2.h>
12 
13 #include "cpu.h"
14 #include "errors.h"
15 #include "logger.h"
16 #include "memory-alloc.h"
17 #include "numeric.h"
18 #include "permassert.h"
19 #include "string-utils.h"
20 #include "time-utils.h"
21 
22 #include "config.h"
23 #include "indexer.h"
24 
25 /*
26  * The entries in a delta index could be stored in a single delta list, but to reduce search times
27  * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
28  * memory managed by the delta_zone structure. The delta_zone can move the data around within its
29  * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
30  * the volume index can contain over a million delta lists, we want to be efficient with the size
31  * of the delta list header information. This information is encoded into 16 bytes per list. The
32  * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
33  * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
34  * sufficient to store the size of a delta list.
35  *
36  * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
37  * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
38  * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
39  * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
40  * number corresponds to its index in the array.
41  *
42  * A standard delta list entry is stored as a fixed length payload (the value) followed by a
43  * variable length key (the delta). A collision entry is used when two block names have the same
44  * delta list address. A collision entry always follows a standard entry for the hash with which it
45  * collides, and is encoded with DELTA == 0 and an additional 256-bit field at the end,
46  * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
47  * indicates a normal entry.
48  *
49  * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
50  * used by small deltas. The Huffman code is specified by three parameters, which can be computed
51  * from the desired mean delta when the index is full. (See compute_coding_constants() for
52  * details.)
53  *
54  * The bit field utilities used to read and write delta entries assume that it is possible to read
55  * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
56  * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
57  * numbered 1 to N, and the guard lists are numbered 0 and N+1. The functions that decode the bit
58  * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
59  * delta list could cause this step to run off the end of the delta_zone memory, so as extra
60  * protection against this happening, the tail guard list is set to all ones.
61  *
62  * The delta_index supports two different forms. The mutable form is created by
63  * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
64  * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
65  * cached) chapter index pages. The immutable form does not allocate delta list headers or
66  * temporary offsets, and thus is somewhat more memory efficient.
67  */
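
/*
 * Illustrative sketch (not part of the original file): under the numbering convention above,
 * bit N of a bit stream lives in byte N / 8 at little-endian bit position N % 8. A hypothetical
 * helper that reads a single stream bit:
 */
static inline unsigned int example_get_stream_bit(const u8 *memory, u64 n)
{
	/* For example, bit 8 is the least significant bit (0x1) of byte 1. */
	return (memory[n / BITS_PER_BYTE] >> (n % BITS_PER_BYTE)) & 1;
}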
68 
69 /*
70  * This is the largest field size supported by get_field() and set_field(). Any field that is
71  * larger is not guaranteed to fit in a single byte-aligned u32.
72  */
73 #define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)
74 
75 /*
76  * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
77  * is larger is not guaranteed to fit in a single byte-aligned u64.
78  */
79 #define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)
80 
81 /*
82  * This is the number of guard bytes needed at the end of the memory byte array when using the bit
83  * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
84  * bytes beyond the end of the desired field. The definition is written to make it clear how this
85  * value is derived.
86  */
87 #define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)
88 
89 /* The number of guard bits that are needed in the tail guard list */
90 #define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)
91 
92 /*
93  * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
94  * buffer of this size can be used with move_bits().
95  */
96 #define DELTA_LIST_MAX_BYTE_COUNT					\
97 	((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
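
/* With U16_MAX == 65535, this works out to (65535 + 8) / 8 + 7 == 8192 + 7 == 8199 bytes. */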
98 
99 /* The number of extra bytes and bits needed to store a collision entry */
100 #define COLLISION_BYTES UDS_RECORD_NAME_SIZE
101 #define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)
102 
103 /*
104  * Immutable delta lists are packed into pages containing a header that encodes the delta list
105  * information into 19 bits per list (64KB bit offset).
106  */
107 #define IMMUTABLE_HEADER_SIZE 19
108 
109 /*
110  * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
111  * number to increment when the format of the data changes.
112  */
113 #define MAGIC_SIZE 8
114 
115 static const char DELTA_INDEX_MAGIC[] = "DI-00002";
116 
117 struct delta_index_header {
118 	char magic[MAGIC_SIZE];
119 	u32 zone_number;
120 	u32 zone_count;
121 	u32 first_list;
122 	u32 list_count;
123 	u64 record_count;
124 	u64 collision_count;
125 };
126 
127 /*
128  * Header data used for immutable delta index pages. This data is followed by the delta list offset
129  * table.
130  */
131 struct delta_page_header {
132 	/* Externally-defined nonce */
133 	u64 nonce;
134 	/* The virtual chapter number */
135 	u64 virtual_chapter_number;
136 	/* Index of the first delta list on the page */
137 	u16 first_list;
138 	/* Number of delta lists on the page */
139 	u16 list_count;
140 } __packed;
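
/* Because the header is packed, it occupies exactly 8 + 8 + 2 + 2 == 20 bytes (160 bits). */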
141 
142 static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
143 {
144 	return delta_list->start / BITS_PER_BYTE;
145 }
146 
147 static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
148 {
149 	unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;
150 
151 	return BITS_TO_BYTES(bit_offset + delta_list->size);
152 }
153 
154 static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
155 				 u32 last)
156 {
157 	struct delta_list *delta_list;
158 	u64 new_start;
159 
160 	if (first == last) {
161 		/* Only one list is moving, and we know there is space. */
162 		delta_list = &delta_zone->delta_lists[first];
163 		new_start = delta_zone->new_offsets[first];
164 		if (delta_list->start != new_start) {
165 			u64 source;
166 			u64 destination;
167 
168 			source = get_delta_list_byte_start(delta_list);
169 			delta_list->start = new_start;
170 			destination = get_delta_list_byte_start(delta_list);
171 			memmove(delta_zone->memory + destination,
172 				delta_zone->memory + source,
173 				get_delta_list_byte_size(delta_list));
174 		}
175 	} else {
176 		/*
177 		 * There is more than one list. Divide the problem in half, and use recursive calls
178 		 * to process each half. Note that after this computation, first <= middle, and
179 		 * middle < last.
180 		 */
181 		u32 middle = (first + last) / 2;
182 
183 		delta_list = &delta_zone->delta_lists[middle];
184 		new_start = delta_zone->new_offsets[middle];
185 
186 		/*
187 		 * The direction that our middle list is moving determines which half of the
188 		 * problem must be processed first.
189 		 */
190 		if (new_start > delta_list->start) {
191 			rebalance_delta_zone(delta_zone, middle + 1, last);
192 			rebalance_delta_zone(delta_zone, first, middle);
193 		} else {
194 			rebalance_delta_zone(delta_zone, first, middle);
195 			rebalance_delta_zone(delta_zone, middle + 1, last);
196 		}
197 	}
198 }
199 
200 static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
201 {
202 	/* Round up so that each zone is a multiple of 64K in size. */
203 	size_t ALLOC_BOUNDARY = 64 * 1024;
204 
205 	return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
206 }
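
/*
 * Worked example (values chosen for illustration): with zone_count == 4 and
 * memory_size == 1000000, each zone gets 1000000 / 4 == 250000 bytes, which rounds up to
 * 262144 (4 * 64K).
 */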
207 
208 void uds_reset_delta_index(const struct delta_index *delta_index)
209 {
210 	unsigned int z;
211 
212 	/*
213 	 * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
214 	 * before the first real entry and one after so that we don't need to bounds check the
215 	 * array access when calculating preceding and following gap sizes.
216 	 */
217 	for (z = 0; z < delta_index->zone_count; z++) {
218 		u64 list_bits;
219 		u64 spacing;
220 		u64 offset;
221 		unsigned int i;
222 		struct delta_zone *zone = &delta_index->delta_zones[z];
223 		struct delta_list *delta_lists = zone->delta_lists;
224 
225 		/* Zeroing the delta list headers initializes the head guard list correctly. */
226 		memset(delta_lists, 0,
227 		       (zone->list_count + 2) * sizeof(struct delta_list));
228 
229 		/* Set all the bits in the end guard list. */
230 		list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
231 		delta_lists[zone->list_count + 1].start = list_bits;
232 		delta_lists[zone->list_count + 1].size = GUARD_BITS;
233 		memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
234 		       POST_FIELD_GUARD_BYTES);
235 
236 		/* Evenly space out the real delta lists by setting regular offsets. */
237 		spacing = list_bits / zone->list_count;
238 		offset = spacing / 2;
239 		for (i = 1; i <= zone->list_count; i++) {
240 			delta_lists[i].start = offset;
241 			offset += spacing;
242 		}
243 
244 		/* Update the statistics. */
245 		zone->discard_count += zone->record_count;
246 		zone->record_count = 0;
247 		zone->collision_count = 0;
248 	}
249 }
250 
251 /* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
252  * three parameters:
253  *
254  *  MINBITS   The number of bits in the smallest code
255  *  BASE      The number of values coded using a code of length MINBITS
256  *  INCR      The number of values coded by using one additional bit
257  *
258  * These parameters are related by this equation:
259  *
260  *	BASE + INCR == 1 << MINBITS
261  *
262  * The math for the Huffman code of an exponential distribution says that
263  *
264  *	INCR = log(2) * MEAN_DELTA
265  *
266  * Then use the smallest MINBITS value so that
267  *
268  *	(1 << MINBITS) > INCR
269  *
270  * And then
271  *
272  *	BASE = (1 << MINBITS) - INCR
273  *
274  * Now the index can generate a code such that
275  * - The first BASE values code using MINBITS bits.
276  * - The next INCR values code using MINBITS+1 bits.
277  * - The next INCR values code using MINBITS+2 bits.
278  * - (and so on).
279  */
280 static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
281 {
282 	/*
283 	 * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
284 	 * floating point, use a really good integer approximation.
285 	 */
286 	*incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
287 	*min_bits = bits_per(*incr_keys + 1);
288 	*min_keys = (1 << *min_bits) - *incr_keys;
289 }
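
/*
 * Worked example (values computed for illustration, not taken from the original file): for
 * mean_delta == 4096, incr_keys == (836158 * 4096 + 603160) / 1206321 == 2839, which is about
 * log(2) * 4096 == 2839.1. Then min_bits == bits_per(2840) == 12 and
 * min_keys == 4096 - 2839 == 1257, so min_keys + incr_keys == 1 << min_bits as required.
 */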
290 
291 void uds_uninitialize_delta_index(struct delta_index *delta_index)
292 {
293 	unsigned int z;
294 
295 	if (delta_index->delta_zones == NULL)
296 		return;
297 
298 	for (z = 0; z < delta_index->zone_count; z++) {
299 		vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets));
300 		vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists));
301 		vdo_free(vdo_forget(delta_index->delta_zones[z].memory));
302 	}
303 
304 	vdo_free(delta_index->delta_zones);
305 	memset(delta_index, 0, sizeof(struct delta_index));
306 }
307 
308 static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
309 				 u32 first_list, u32 list_count, u32 mean_delta,
310 				 u32 payload_bits, u8 tag)
311 {
312 	int result;
313 
314 	result = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
315 	if (result != VDO_SUCCESS)
316 		return result;
317 
318 	result = vdo_allocate(list_count + 2, u64, "delta list temp", &delta_zone->new_offsets);
319 	if (result != VDO_SUCCESS)
320 		return result;
321 
322 	/* Allocate the delta lists. */
323 	result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", &delta_zone->delta_lists);
324 	if (result != VDO_SUCCESS)
325 		return result;
326 
327 	compute_coding_constants(mean_delta, &delta_zone->min_bits,
328 				 &delta_zone->min_keys, &delta_zone->incr_keys);
329 	delta_zone->value_bits = payload_bits;
330 	delta_zone->buffered_writer = NULL;
331 	delta_zone->size = size;
332 	delta_zone->rebalance_time = 0;
333 	delta_zone->rebalance_count = 0;
334 	delta_zone->record_count = 0;
335 	delta_zone->collision_count = 0;
336 	delta_zone->discard_count = 0;
337 	delta_zone->overflow_count = 0;
338 	delta_zone->first_list = first_list;
339 	delta_zone->list_count = list_count;
340 	delta_zone->tag = tag;
341 
342 	return UDS_SUCCESS;
343 }
344 
345 int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
346 			       u32 list_count, u32 mean_delta, u32 payload_bits,
347 			       size_t memory_size, u8 tag)
348 {
349 	int result;
350 	unsigned int z;
351 	size_t zone_memory;
352 
353 	result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", &delta_index->delta_zones);
354 	if (result != VDO_SUCCESS)
355 		return result;
356 
357 	delta_index->zone_count = zone_count;
358 	delta_index->list_count = list_count;
359 	delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
360 	delta_index->memory_size = 0;
361 	delta_index->mutable = true;
362 	delta_index->tag = tag;
363 
364 	for (z = 0; z < zone_count; z++) {
365 		u32 lists_in_zone = delta_index->lists_per_zone;
366 		u32 first_list_in_zone = z * lists_in_zone;
367 
368 		if (z == zone_count - 1) {
369 			/*
370 			 * The last zone gets fewer lists if zone_count doesn't evenly divide
371 			 * list_count. We'll have an underflow if the assertion below doesn't hold.
372 			 */
373 			if (delta_index->list_count <= first_list_in_zone) {
374 				uds_uninitialize_delta_index(delta_index);
375 				return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
376 							      "%u delta lists not enough for %u zones",
377 							      list_count, zone_count);
378 			}
379 			lists_in_zone = delta_index->list_count - first_list_in_zone;
380 		}
381 
382 		zone_memory = get_zone_memory_size(zone_count, memory_size);
383 		result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
384 					       first_list_in_zone, lists_in_zone,
385 					       mean_delta, payload_bits, tag);
386 		if (result != UDS_SUCCESS) {
387 			uds_uninitialize_delta_index(delta_index);
388 			return result;
389 		}
390 
391 		delta_index->memory_size +=
392 			(sizeof(struct delta_zone) + zone_memory +
393 			 (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
394 	}
395 
396 	uds_reset_delta_index(delta_index);
397 	return UDS_SUCCESS;
398 }
399 
400 /* Read a bit field from an arbitrary bit boundary. */
401 static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
402 {
403 	const void *addr = memory + offset / BITS_PER_BYTE;
404 
405 	return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
406 }
407 
408 /* Write a bit field to an arbitrary bit boundary. */
409 static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
410 {
411 	void *addr = memory + offset / BITS_PER_BYTE;
412 	int shift = offset % BITS_PER_BYTE;
413 	u32 data = get_unaligned_le32(addr);
414 
415 	data &= ~(((1 << size) - 1) << shift);
416 	data |= value << shift;
417 	put_unaligned_le32(data, addr);
418 }
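
/*
 * Illustrative sketch (not part of the original file): a round trip through set_field() and
 * get_field() at an unaligned bit offset. The function name is hypothetical; scratch must be at
 * least 8 bytes so that the unaligned 32-bit accesses stay inside the buffer.
 */
static inline bool example_field_round_trip(u8 *scratch)
{
	/* Write the 5-bit value 0x15 starting at bit 11 (byte 1, bit position 3). */
	set_field(0x15, scratch, 11, 5);
	/* Reading back with the same offset and size recovers the value. */
	return get_field(scratch, 11, 5) == 0x15;
}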
419 
420 /* Get the bit offset to the immutable delta list header. */
421 static inline u32 get_immutable_header_offset(u32 list_number)
422 {
423 	return sizeof(struct delta_page_header) * BITS_PER_BYTE +
424 		list_number * IMMUTABLE_HEADER_SIZE;
425 }
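
/*
 * For example, with the 20-byte (160-bit) packed page header, the offset table entry for list 0
 * occupies bits 160 through 178, list 1 occupies bits 179 through 197, and so on at 19-bit
 * intervals.
 */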
426 
427 /* Get the bit offset to the start of the immutable delta list bit stream. */
428 static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
429 {
430 	return get_field(memory, get_immutable_header_offset(list_number),
431 			 IMMUTABLE_HEADER_SIZE);
432 }
433 
434 /* Set the bit offset to the start of the immutable delta list bit stream. */
435 static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
436 {
437 	set_field(start, memory, get_immutable_header_offset(list_number),
438 		  IMMUTABLE_HEADER_SIZE);
439 }
440 
441 static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
442 				    u8 *memory, size_t memory_size)
443 {
444 	unsigned int i;
445 
446 	/*
447 	 * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
448 	 * entire volume at least once.
449 	 */
450 	if (nonce != expected_nonce)
451 		return false;
452 
453 	/* Verify that the number of delta lists can fit in the page. */
454 	if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
455 			  BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
456 		return false;
457 
458 	/*
459 	 * Verify that the first delta list is immediately after the last delta
460 	 * list header.
461 	 */
462 	if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
463 		return false;
464 
465 	/* Verify that the lists are in the correct order. */
466 	for (i = 0; i < list_count; i++) {
467 		if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
468 			return false;
469 	}
470 
471 	/*
472 	 * Verify that the last list ends on the page, and that there is room
473 	 * for the post-field guard bits.
474 	 */
475 	if (get_immutable_start(memory, list_count) >
476 	    (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
477 		return false;
478 
479 	/* Verify that the guard bytes are correctly set to all ones. */
480 	for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
481 		if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
482 			return false;
483 	}
484 
485 	/* All verifications passed. */
486 	return true;
487 }
488 
489 /* Initialize a delta index page to refer to a supplied page. */
490 int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
491 				    u64 expected_nonce, u32 mean_delta, u32 payload_bits,
492 				    u8 *memory, size_t memory_size)
493 {
494 	u64 nonce;
495 	u64 vcn;
496 	u64 first_list;
497 	u64 list_count;
498 	struct delta_page_header *header = (struct delta_page_header *) memory;
499 	struct delta_zone *delta_zone = &delta_index_page->delta_zone;
500 	const u8 *nonce_addr = (const u8 *) &header->nonce;
501 	const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
502 	const u8 *first_list_addr = (const u8 *) &header->first_list;
503 	const u8 *list_count_addr = (const u8 *) &header->list_count;
504 
505 	/* First assume that the header is little endian. */
506 	nonce = get_unaligned_le64(nonce_addr);
507 	vcn = get_unaligned_le64(vcn_addr);
508 	first_list = get_unaligned_le16(first_list_addr);
509 	list_count = get_unaligned_le16(list_count_addr);
510 	if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
511 				     memory_size)) {
512 		/* If that fails, try big endian. */
513 		nonce = get_unaligned_be64(nonce_addr);
514 		vcn = get_unaligned_be64(vcn_addr);
515 		first_list = get_unaligned_be16(first_list_addr);
516 		list_count = get_unaligned_be16(list_count_addr);
517 		if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
518 					     memory_size)) {
519 			/*
520 			 * Both attempts failed. Do not log this as an error, because it can happen
521 			 * during a rebuild if we haven't written the entire volume at least once.
522 			 */
523 			return UDS_CORRUPT_DATA;
524 		}
525 	}
526 
527 	delta_index_page->delta_index.delta_zones = delta_zone;
528 	delta_index_page->delta_index.zone_count = 1;
529 	delta_index_page->delta_index.list_count = list_count;
530 	delta_index_page->delta_index.lists_per_zone = list_count;
531 	delta_index_page->delta_index.mutable = false;
532 	delta_index_page->delta_index.tag = 'p';
533 	delta_index_page->virtual_chapter_number = vcn;
534 	delta_index_page->lowest_list_number = first_list;
535 	delta_index_page->highest_list_number = first_list + list_count - 1;
536 
537 	compute_coding_constants(mean_delta, &delta_zone->min_bits,
538 				 &delta_zone->min_keys, &delta_zone->incr_keys);
539 	delta_zone->value_bits = payload_bits;
540 	delta_zone->memory = memory;
541 	delta_zone->delta_lists = NULL;
542 	delta_zone->new_offsets = NULL;
543 	delta_zone->buffered_writer = NULL;
544 	delta_zone->size = memory_size;
545 	delta_zone->rebalance_time = 0;
546 	delta_zone->rebalance_count = 0;
547 	delta_zone->record_count = 0;
548 	delta_zone->collision_count = 0;
549 	delta_zone->discard_count = 0;
550 	delta_zone->overflow_count = 0;
551 	delta_zone->first_list = 0;
552 	delta_zone->list_count = list_count;
553 	delta_zone->tag = 'p';
554 
555 	return UDS_SUCCESS;
556 }
557 
558 /* Read a large bit field from an arbitrary bit boundary. */
559 static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
560 {
561 	const void *addr = memory + offset / BITS_PER_BYTE;
562 
563 	return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
564 }
565 
566 /* Write a large bit field to an arbitrary bit boundary. */
567 static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
568 {
569 	void *addr = memory + offset / BITS_PER_BYTE;
570 	u8 shift = offset % BITS_PER_BYTE;
571 	u64 data = get_unaligned_le64(addr);
572 
573 	data &= ~(((1UL << size) - 1) << shift);
574 	data |= value << shift;
575 	put_unaligned_le64(data, addr);
576 }
577 
578 /* Set a sequence of bits to all zeros. */
579 static inline void set_zero(u8 *memory, u64 offset, u32 size)
580 {
581 	if (size > 0) {
582 		u8 *addr = memory + offset / BITS_PER_BYTE;
583 		u8 shift = offset % BITS_PER_BYTE;
584 		u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
585 
586 		*addr++ &= ~(((1 << count) - 1) << shift);
587 		for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
588 			*addr++ = 0;
589 
590 		if (size > 0)
591 			*addr &= 0xFF << size;
592 	}
593 }
594 
595 /*
596  * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
597  * size and memory offsets are measured in bits.
598  */
599 static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
600 {
601 	const u8 *source;
602 	u8 *destination;
603 	u8 offset;
604 	u8 count;
605 	u64 field;
606 
607 	/* Start by moving one field that ends on a destination int boundary. */
608 	count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
609 	field = get_big_field(from, from_offset, count);
610 	set_big_field(field, to, to_offset, count);
611 	from_offset += count;
612 	to_offset += count;
613 	size -= count;
614 
615 	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
616 	offset = from_offset % BITS_PER_TYPE(u32);
617 	source = from + (from_offset - offset) / BITS_PER_BYTE;
618 	destination = to + to_offset / BITS_PER_BYTE;
619 	while (size > MAX_BIG_FIELD_BITS) {
620 		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
621 		source += sizeof(u32);
622 		destination += sizeof(u32);
623 		from_offset += BITS_PER_TYPE(u32);
624 		to_offset += BITS_PER_TYPE(u32);
625 		size -= BITS_PER_TYPE(u32);
626 	}
627 
628 	/* Finish up by moving any remaining bits. */
629 	if (size > 0) {
630 		field = get_big_field(from, from_offset, size);
631 		set_big_field(field, to, to_offset, size);
632 	}
633 }
634 
635 /*
636  * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
637  * size and memory offsets are measured in bits.
638  */
639 static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
640 {
641 	const u8 *source;
642 	u8 *destination;
643 	u8 offset;
644 	u8 count;
645 	u64 field;
646 
647 	/* Start by moving one field that begins on a destination int boundary. */
648 	count = (to_offset + size) % BITS_PER_TYPE(u32);
649 	if (count > 0) {
650 		size -= count;
651 		field = get_big_field(from, from_offset + size, count);
652 		set_big_field(field, to, to_offset + size, count);
653 	}
654 
655 	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
656 	offset = (from_offset + size) % BITS_PER_TYPE(u32);
657 	source = from + (from_offset + size - offset) / BITS_PER_BYTE;
658 	destination = to + (to_offset + size) / BITS_PER_BYTE;
659 	while (size > MAX_BIG_FIELD_BITS) {
660 		source -= sizeof(u32);
661 		destination -= sizeof(u32);
662 		size -= BITS_PER_TYPE(u32);
663 		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
664 	}
665 
666 	/* Finish up by moving any remaining bits. */
667 	if (size > 0) {
668 		field = get_big_field(from, from_offset, size);
669 		set_big_field(field, to, to_offset, size);
670 	}
671 }
672 
673 /*
674  * Move bits from one field to another. When the fields overlap, behave as if we first move all the
675  * bits from the source to a temporary value, and then move all the bits from the temporary value
676  * to the destination. The size and memory offsets are measured in bits.
677  */
678 static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
679 {
680 	u64 field;
681 
682 	/* A small move doesn't require special handling. */
683 	if (size <= MAX_BIG_FIELD_BITS) {
684 		if (size > 0) {
685 			field = get_big_field(from, from_offset, size);
686 			set_big_field(field, to, to_offset, size);
687 		}
688 
689 		return;
690 	}
691 
692 	if (from_offset > to_offset)
693 		move_bits_down(from, from_offset, to, to_offset, size);
694 	else
695 		move_bits_up(from, from_offset, to, to_offset, size);
696 }
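
/*
 * Illustrative sketch (not part of the original file): because move_bits() picks a copy
 * direction based on the offsets, it is safe for overlapping ranges within one buffer, much as
 * memmove() is for bytes. The function name is hypothetical; memory must include the usual
 * POST_FIELD_GUARD_BYTES of slack after the bit field.
 */
static inline void example_shift_bits_down(u8 *memory, u64 offset, u32 size)
{
	/* Shift a bit range three bits toward the start of the buffer (offset must be >= 3). */
	move_bits(memory, offset, memory, offset - 3, size);
}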
697 
698 /*
699  * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
700  * lists (starting with a specified list index) is copied from the mutable delta index into a
701  * memory page used in the immutable index. The number of lists copied onto the page is returned in
702  * list_count.
703  */
704 int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
705 			      u8 *memory, size_t memory_size, u64 virtual_chapter_number,
706 			      u32 first_list, u32 *list_count)
707 {
708 	const struct delta_zone *delta_zone;
709 	struct delta_list *delta_lists;
710 	u32 max_lists;
711 	u32 n_lists = 0;
712 	u32 offset;
713 	u32 i;
714 	int free_bits;
715 	int bits;
716 	struct delta_page_header *header;
717 
718 	delta_zone = &delta_index->delta_zones[0];
719 	delta_lists = &delta_zone->delta_lists[first_list + 1];
720 	max_lists = delta_index->list_count - first_list;
721 
722 	/*
723 	 * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
724 	 * delta list offset, and the guard bytes from the page size to determine how much space is
725 	 * available for delta lists.
726 	 */
727 	free_bits = memory_size * BITS_PER_BYTE;
728 	free_bits -= get_immutable_header_offset(1);
729 	free_bits -= GUARD_BITS;
730 	if (free_bits < IMMUTABLE_HEADER_SIZE) {
731 		/* This page is too small to store any delta lists. */
732 		return vdo_log_error_strerror(UDS_OVERFLOW,
733 					      "Chapter Index Page of %zu bytes is too small",
734 					      memory_size);
735 	}
736 
737 	while (n_lists < max_lists) {
738 		/* Each list requires a delta list offset and the list data. */
739 		bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
740 		if (bits > free_bits)
741 			break;
742 
743 		n_lists++;
744 		free_bits -= bits;
745 	}
746 
747 	*list_count = n_lists;
748 
749 	header = (struct delta_page_header *) memory;
750 	put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
751 	put_unaligned_le64(virtual_chapter_number,
752 			   (u8 *) &header->virtual_chapter_number);
753 	put_unaligned_le16(first_list, (u8 *) &header->first_list);
754 	put_unaligned_le16(n_lists, (u8 *) &header->list_count);
755 
756 	/* Construct the delta list offset table. */
757 	offset = get_immutable_header_offset(n_lists + 1);
758 	set_immutable_start(memory, 0, offset);
759 	for (i = 0; i < n_lists; i++) {
760 		offset += delta_lists[i].size;
761 		set_immutable_start(memory, i + 1, offset);
762 	}
763 
764 	/* Copy the delta list data onto the memory page. */
765 	for (i = 0; i < n_lists; i++) {
766 		move_bits(delta_zone->memory, delta_lists[i].start, memory,
767 			  get_immutable_start(memory, i), delta_lists[i].size);
768 	}
769 
770 	/* Set all the bits in the guard bytes. */
771 	memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
772 	       POST_FIELD_GUARD_BYTES);
773 	return UDS_SUCCESS;
774 }
775 
776 /* Compute the new offsets of the delta lists. */
777 static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
778 				     size_t growing_size, size_t used_space)
779 {
780 	size_t spacing;
781 	u32 i;
782 	struct delta_list *delta_lists = delta_zone->delta_lists;
783 	u32 tail_guard_index = delta_zone->list_count + 1;
784 
785 	spacing = (delta_zone->size - used_space) / delta_zone->list_count;
786 	delta_zone->new_offsets[0] = 0;
787 	for (i = 0; i <= delta_zone->list_count; i++) {
788 		delta_zone->new_offsets[i + 1] =
789 			(delta_zone->new_offsets[i] +
790 			 get_delta_list_byte_size(&delta_lists[i]) + spacing);
791 		delta_zone->new_offsets[i] *= BITS_PER_BYTE;
792 		delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
793 		if (i == 0)
794 			delta_zone->new_offsets[i + 1] -= spacing / 2;
795 		if (i + 1 == growing_index)
796 			delta_zone->new_offsets[i + 1] += growing_size;
797 	}
798 
799 	delta_zone->new_offsets[tail_guard_index] =
800 		(delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
801 }
802 
803 static void rebalance_lists(struct delta_zone *delta_zone)
804 {
805 	struct delta_list *delta_lists;
806 	u32 i;
807 	size_t used_space = 0;
808 
809 	/* Extend and balance memory to receive the delta lists */
810 	delta_lists = delta_zone->delta_lists;
811 	for (i = 0; i <= delta_zone->list_count + 1; i++)
812 		used_space += get_delta_list_byte_size(&delta_lists[i]);
813 
814 	compute_new_list_offsets(delta_zone, 0, 0, used_space);
815 	for (i = 1; i <= delta_zone->list_count + 1; i++)
816 		delta_lists[i].start = delta_zone->new_offsets[i];
817 }
818 
819 /* Start restoring a delta index from multiple input streams. */
820 int uds_start_restoring_delta_index(struct delta_index *delta_index,
821 				    struct buffered_reader **buffered_readers,
822 				    unsigned int reader_count)
823 {
824 	int result;
825 	unsigned int zone_count = reader_count;
826 	u64 record_count = 0;
827 	u64 collision_count = 0;
828 	u32 first_list[MAX_ZONES];
829 	u32 list_count[MAX_ZONES];
830 	unsigned int z;
831 	u32 list_next = 0;
832 	const struct delta_zone *delta_zone;
833 
834 	/* Read and validate each header. */
835 	for (z = 0; z < zone_count; z++) {
836 		struct delta_index_header header;
837 		u8 buffer[sizeof(struct delta_index_header)];
838 		size_t offset = 0;
839 
840 		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
841 						       sizeof(buffer));
842 		if (result != UDS_SUCCESS) {
843 			return vdo_log_warning_strerror(result,
844 							"failed to read delta index header");
845 		}
846 
847 		memcpy(&header.magic, buffer, MAGIC_SIZE);
848 		offset += MAGIC_SIZE;
849 		decode_u32_le(buffer, &offset, &header.zone_number);
850 		decode_u32_le(buffer, &offset, &header.zone_count);
851 		decode_u32_le(buffer, &offset, &header.first_list);
852 		decode_u32_le(buffer, &offset, &header.list_count);
853 		decode_u64_le(buffer, &offset, &header.record_count);
854 		decode_u64_le(buffer, &offset, &header.collision_count);
855 
856 		result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
857 				    "%zu bytes decoded of %zu expected", offset,
858 				    sizeof(struct delta_index_header));
859 		if (result != VDO_SUCCESS) {
860 			return vdo_log_warning_strerror(result,
861 							"failed to read delta index header");
862 		}
863 
864 		if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
865 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
866 							"delta index file has bad magic number");
867 		}
868 
869 		if (zone_count != header.zone_count) {
870 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
871 							"delta index files contain mismatched zone counts (%u,%u)",
872 							zone_count, header.zone_count);
873 		}
874 
875 		if (header.zone_number != z) {
876 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
877 							"delta index zone %u found in slot %u",
878 							header.zone_number, z);
879 		}
880 
881 		first_list[z] = header.first_list;
882 		list_count[z] = header.list_count;
883 		record_count += header.record_count;
884 		collision_count += header.collision_count;
885 
886 		if (first_list[z] != list_next) {
887 			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
888 							"delta index file for zone %u starts with list %u instead of list %u",
889 							z, first_list[z], list_next);
890 		}
891 
892 		list_next += list_count[z];
893 	}
894 
895 	if (list_next != delta_index->list_count) {
896 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
897 						"delta index files contain %u delta lists instead of %u delta lists",
898 						list_next, delta_index->list_count);
899 	}
900 
901 	if (collision_count > record_count) {
902 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
903 						"delta index files contain %llu collisions and %llu records",
904 						(unsigned long long) collision_count,
905 						(unsigned long long) record_count);
906 	}
907 
908 	uds_reset_delta_index(delta_index);
909 	delta_index->delta_zones[0].record_count = record_count;
910 	delta_index->delta_zones[0].collision_count = collision_count;
911 
912 	/* Read the delta lists and distribute them to the proper zones. */
913 	for (z = 0; z < zone_count; z++) {
914 		u32 i;
915 
916 		delta_index->load_lists[z] = 0;
917 		for (i = 0; i < list_count[z]; i++) {
918 			u16 delta_list_size;
919 			u32 list_number;
920 			unsigned int zone_number;
921 			u8 size_data[sizeof(u16)];
922 
923 			result = uds_read_from_buffered_reader(buffered_readers[z],
924 							       size_data,
925 							       sizeof(size_data));
926 			if (result != UDS_SUCCESS) {
927 				return vdo_log_warning_strerror(result,
928 								"failed to read delta index size");
929 			}
930 
931 			delta_list_size = get_unaligned_le16(size_data);
932 			if (delta_list_size > 0)
933 				delta_index->load_lists[z] += 1;
934 
935 			list_number = first_list[z] + i;
936 			zone_number = list_number / delta_index->lists_per_zone;
937 			delta_zone = &delta_index->delta_zones[zone_number];
938 			list_number -= delta_zone->first_list;
939 			delta_zone->delta_lists[list_number + 1].size = delta_list_size;
940 		}
941 	}
942 
943 	/* Prepare each zone to start receiving the delta list data. */
944 	for (z = 0; z < delta_index->zone_count; z++)
945 		rebalance_lists(&delta_index->delta_zones[z]);
946 
947 	return UDS_SUCCESS;
948 }
949 
950 static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
951 				      const struct delta_list_save_info *save_info,
952 				      const u8 *data)
953 {
954 	struct delta_list *delta_list;
955 	u16 bit_count;
956 	u16 byte_count;
957 	u32 list_number = save_info->index - delta_zone->first_list;
958 
959 	if (list_number >= delta_zone->list_count) {
960 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
961 						"invalid delta list number %u not in range [%u,%u)",
962 						save_info->index, delta_zone->first_list,
963 						delta_zone->first_list + delta_zone->list_count);
964 	}
965 
966 	delta_list = &delta_zone->delta_lists[list_number + 1];
967 	if (delta_list->size == 0) {
968 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
969 						"unexpected delta list number %u",
970 						save_info->index);
971 	}
972 
973 	bit_count = delta_list->size + save_info->bit_offset;
974 	byte_count = BITS_TO_BYTES(bit_count);
975 	if (save_info->byte_count != byte_count) {
976 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
977 						"unexpected delta list size %u != %u",
978 						save_info->byte_count, byte_count);
979 	}
980 
981 	move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
982 		  delta_list->size);
983 	return UDS_SUCCESS;
984 }
985 
986 static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
987 				   struct buffered_reader *buffered_reader, u8 *data)
988 {
989 	int result;
990 	struct delta_list_save_info save_info;
991 	u8 buffer[sizeof(struct delta_list_save_info)];
992 	unsigned int new_zone;
993 
994 	result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
995 	if (result != UDS_SUCCESS) {
996 		return vdo_log_warning_strerror(result,
997 						"failed to read delta list data");
998 	}
999 
1000 	save_info = (struct delta_list_save_info) {
1001 		.tag = buffer[0],
1002 		.bit_offset = buffer[1],
1003 		.byte_count = get_unaligned_le16(&buffer[2]),
1004 		.index = get_unaligned_le32(&buffer[4]),
1005 	};
1006 
1007 	if ((save_info.bit_offset >= BITS_PER_BYTE) ||
1008 	    (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
1009 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
1010 						"corrupt delta list data");
1011 	}
1012 
1013 	/* Make sure the data is intended for this delta index. */
1014 	if (save_info.tag != delta_index->tag)
1015 		return UDS_CORRUPT_DATA;
1016 
1017 	if (save_info.index >= delta_index->list_count) {
1018 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
1019 						"invalid delta list number %u of %u",
1020 						save_info.index,
1021 						delta_index->list_count);
1022 	}
1023 
1024 	result = uds_read_from_buffered_reader(buffered_reader, data,
1025 					       save_info.byte_count);
1026 	if (result != UDS_SUCCESS) {
1027 		return vdo_log_warning_strerror(result,
1028 						"failed to read delta list data");
1029 	}
1030 
1031 	delta_index->load_lists[load_zone] -= 1;
1032 	new_zone = save_info.index / delta_index->lists_per_zone;
1033 	return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
1034 					  &save_info, data);
1035 }
1036 
1037 /* Restore delta lists from saved data. */
1038 int uds_finish_restoring_delta_index(struct delta_index *delta_index,
1039 				     struct buffered_reader **buffered_readers,
1040 				     unsigned int reader_count)
1041 {
1042 	int result;
1043 	int saved_result = UDS_SUCCESS;
1044 	unsigned int z;
1045 	u8 *data;
1046 
1047 	result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
1048 	if (result != VDO_SUCCESS)
1049 		return result;
1050 
1051 	for (z = 0; z < reader_count; z++) {
1052 		while (delta_index->load_lists[z] > 0) {
1053 			result = restore_delta_list_data(delta_index, z,
1054 							 buffered_readers[z], data);
1055 			if (result != UDS_SUCCESS) {
1056 				saved_result = result;
1057 				break;
1058 			}
1059 		}
1060 	}
1061 
1062 	vdo_free(data);
1063 	return saved_result;
1064 }
1065 
1066 int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
1067 				unsigned int reader_count)
1068 {
1069 	int result;
1070 	unsigned int z;
1071 	u8 buffer[sizeof(struct delta_list_save_info)];
1072 
1073 	for (z = 0; z < reader_count; z++) {
1074 		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
1075 						       sizeof(buffer));
1076 		if (result != UDS_SUCCESS)
1077 			return result;
1078 
1079 		if (buffer[0] != 'z')
1080 			return UDS_CORRUPT_DATA;
1081 	}
1082 
1083 	return UDS_SUCCESS;
1084 }
1085 
1086 static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
1087 {
1088 	struct delta_list *delta_list;
1089 	u8 buffer[sizeof(struct delta_list_save_info)];
1090 	int result;
1091 
1092 	delta_list = &zone->delta_lists[flush_index + 1];
1093 
1094 	buffer[0] = zone->tag;
1095 	buffer[1] = delta_list->start % BITS_PER_BYTE;
1096 	put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
1097 	put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);
1098 
1099 	result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
1100 					      sizeof(buffer));
1101 	if (result != UDS_SUCCESS) {
1102 		vdo_log_warning_strerror(result, "failed to write delta list memory");
1103 		return result;
1104 	}
1105 
1106 	result = uds_write_to_buffered_writer(zone->buffered_writer,
1107 					      zone->memory + get_delta_list_byte_start(delta_list),
1108 					      get_delta_list_byte_size(delta_list));
1109 	if (result != UDS_SUCCESS)
1110 		vdo_log_warning_strerror(result, "failed to write delta list memory");
1111 
1112 	return result;
1113 }
1114 
1115 /* Start saving a delta index zone to a buffered output stream. */
1116 int uds_start_saving_delta_index(const struct delta_index *delta_index,
1117 				 unsigned int zone_number,
1118 				 struct buffered_writer *buffered_writer)
1119 {
1120 	int result;
1121 	u32 i;
1122 	struct delta_zone *delta_zone;
1123 	u8 buffer[sizeof(struct delta_index_header)];
1124 	size_t offset = 0;
1125 
1126 	delta_zone = &delta_index->delta_zones[zone_number];
1127 	memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
1128 	offset += MAGIC_SIZE;
1129 	encode_u32_le(buffer, &offset, zone_number);
1130 	encode_u32_le(buffer, &offset, delta_index->zone_count);
1131 	encode_u32_le(buffer, &offset, delta_zone->first_list);
1132 	encode_u32_le(buffer, &offset, delta_zone->list_count);
1133 	encode_u64_le(buffer, &offset, delta_zone->record_count);
1134 	encode_u64_le(buffer, &offset, delta_zone->collision_count);
1135 
1136 	result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
1137 			    "%zu bytes encoded of %zu expected", offset,
1138 			    sizeof(struct delta_index_header));
1139 	if (result != VDO_SUCCESS)
1140 		return result;
1141 
1142 	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
1143 	if (result != UDS_SUCCESS)
1144 		return vdo_log_warning_strerror(result,
1145 						"failed to write delta index header");
1146 
1147 	for (i = 0; i < delta_zone->list_count; i++) {
1148 		u8 data[sizeof(u16)];
1149 		struct delta_list *delta_list;
1150 
1151 		delta_list = &delta_zone->delta_lists[i + 1];
1152 		put_unaligned_le16(delta_list->size, data);
1153 		result = uds_write_to_buffered_writer(buffered_writer, data,
1154 						      sizeof(data));
1155 		if (result != UDS_SUCCESS)
1156 			return vdo_log_warning_strerror(result,
1157 							"failed to write delta list size");
1158 	}
1159 
1160 	delta_zone->buffered_writer = buffered_writer;
1161 	return UDS_SUCCESS;
1162 }
1163 
1164 int uds_finish_saving_delta_index(const struct delta_index *delta_index,
1165 				  unsigned int zone_number)
1166 {
1167 	int result;
1168 	int first_error = UDS_SUCCESS;
1169 	u32 i;
1170 	struct delta_zone *delta_zone;
1171 	struct delta_list *delta_list;
1172 
1173 	delta_zone = &delta_index->delta_zones[zone_number];
1174 	for (i = 0; i < delta_zone->list_count; i++) {
1175 		delta_list = &delta_zone->delta_lists[i + 1];
1176 		if (delta_list->size > 0) {
1177 			result = flush_delta_list(delta_zone, i);
1178 			if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
1179 				first_error = result;
1180 		}
1181 	}
1182 
1183 	delta_zone->buffered_writer = NULL;
1184 	return first_error;
1185 }
1186 
1187 int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
1188 {
1189 	int result;
1190 	u8 buffer[sizeof(struct delta_list_save_info)];
1191 
1192 	memset(buffer, 0, sizeof(struct delta_list_save_info));
1193 	buffer[0] = 'z';
1194 
1195 	result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
1196 	if (result != UDS_SUCCESS)
1197 		vdo_log_warning_strerror(result, "failed to write guard delta list");
1198 
1199 	return UDS_SUCCESS;
1200 }
1201 
1202 size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
1203 {
1204 	/* A single zone will use at least as much memory as any other zone count. */
1205 	return (sizeof(struct delta_index_header) +
1206 		list_count * (sizeof(struct delta_list_save_info) + 1) +
1207 		get_zone_memory_size(1, memory_size));
1208 }
1209 
1210 static int assert_not_at_end(const struct delta_index_entry *delta_entry)
1211 {
1212 	int result = VDO_ASSERT(!delta_entry->at_end,
1213 				"operation is invalid because the list entry is at the end of the delta list");
1214 	if (result != VDO_SUCCESS)
1215 		result = UDS_BAD_STATE;
1216 
1217 	return result;
1218 }
1219 
1220 /*
1221  * Prepare to search for an entry in the specified delta list.
1222  *
1223  * This is always the first function to be called when dealing with delta index entries. It is
1224  * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
1225  * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
1226  * entry from the list.
1227  */
1228 int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
1229 				 u32 key, struct delta_index_entry *delta_entry)
1230 {
1231 	int result;
1232 	unsigned int zone_number;
1233 	struct delta_zone *delta_zone;
1234 	struct delta_list *delta_list;
1235 
1236 	result = VDO_ASSERT((list_number < delta_index->list_count),
1237 			    "Delta list number (%u) is out of range (%u)", list_number,
1238 			    delta_index->list_count);
1239 	if (result != VDO_SUCCESS)
1240 		return UDS_CORRUPT_DATA;
1241 
1242 	zone_number = list_number / delta_index->lists_per_zone;
1243 	delta_zone = &delta_index->delta_zones[zone_number];
1244 	list_number -= delta_zone->first_list;
1245 	result = VDO_ASSERT((list_number < delta_zone->list_count),
1246 			    "Delta list number (%u) is out of range (%u) for zone (%u)",
1247 			    list_number, delta_zone->list_count, zone_number);
1248 	if (result != VDO_SUCCESS)
1249 		return UDS_CORRUPT_DATA;
1250 
1251 	if (delta_index->mutable) {
1252 		delta_list = &delta_zone->delta_lists[list_number + 1];
1253 	} else {
1254 		u32 end_offset;
1255 
1256 		/*
1257 		 * Translate the immutable delta list header into a temporary
1258 		 * full delta list header.
1259 		 */
1260 		delta_list = &delta_entry->temp_delta_list;
1261 		delta_list->start = get_immutable_start(delta_zone->memory, list_number);
1262 		end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
1263 		delta_list->size = end_offset - delta_list->start;
1264 		delta_list->save_key = 0;
1265 		delta_list->save_offset = 0;
1266 	}
1267 
1268 	if (key > delta_list->save_key) {
1269 		delta_entry->key = delta_list->save_key;
1270 		delta_entry->offset = delta_list->save_offset;
1271 	} else {
1272 		delta_entry->key = 0;
1273 		delta_entry->offset = 0;
1274 		if (key == 0) {
1275 			/*
1276 			 * This usually means we're about to walk the entire delta list, so get all
1277 			 * of it into the CPU cache.
1278 			 */
1279 			uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
1280 					   delta_list->size / BITS_PER_BYTE, false);
1281 		}
1282 	}
1283 
1284 	delta_entry->at_end = false;
1285 	delta_entry->delta_zone = delta_zone;
1286 	delta_entry->delta_list = delta_list;
1287 	delta_entry->entry_bits = 0;
1288 	delta_entry->is_collision = false;
1289 	delta_entry->list_number = list_number;
1290 	delta_entry->list_overflow = false;
1291 	delta_entry->value_bits = delta_zone->value_bits;
1292 	return UDS_SUCCESS;
1293 }
1294 
1295 static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
1296 {
1297 	return delta_entry->delta_list->start + delta_entry->offset;
1298 }
1299 
1300 /*
1301  * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
1302  * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
1303  * bit stream and update the delta_index_entry to describe the entry.
1304  */
1305 static inline void decode_delta(struct delta_index_entry *delta_entry)
1306 {
1307 	int key_bits;
1308 	u32 delta;
1309 	const struct delta_zone *delta_zone = delta_entry->delta_zone;
1310 	const u8 *memory = delta_zone->memory;
1311 	u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
1312 	const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
1313 	int offset = delta_offset % BITS_PER_BYTE;
1314 	u32 data = get_unaligned_le32(addr) >> offset;
1315 
1316 	addr += sizeof(u32);
1317 	key_bits = delta_zone->min_bits;
1318 	delta = data & ((1 << key_bits) - 1);
1319 	if (delta >= delta_zone->min_keys) {
1320 		data >>= key_bits;
1321 		if (data == 0) {
1322 			key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
1323 			while ((data = get_unaligned_le32(addr)) == 0) {
1324 				addr += sizeof(u32);
1325 				key_bits += sizeof(u32) * BITS_PER_BYTE;
1326 			}
1327 		}
1328 		key_bits += ffs(data);
1329 		delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
1330 	}
1331 	delta_entry->delta = delta;
1332 	delta_entry->key += delta;
1333 
1334 	/* Check for a collision, a delta of zero after the start. */
1335 	if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
1336 		delta_entry->is_collision = true;
1337 		delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
1338 	} else {
1339 		delta_entry->is_collision = false;
1340 		delta_entry->entry_bits = delta_entry->value_bits + key_bits;
1341 	}
1342 }
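
/*
 * Worked example (using the illustrative constants for mean_delta == 4096, not values from the
 * original file): with min_bits == 12, min_keys == 1257, and incr_keys == 2839, a 12-bit value
 * below 1257 is the delta itself. Otherwise the decoder scans for the terminating 1 bit: finding
 * it immediately (key_bits == 13) leaves the delta equal to the 12-bit value, and each
 * additional zero bit before the 1 adds another incr_keys == 2839 to the delta.
 */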
1343 
1344 noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
1345 {
1346 	int result;
1347 	const struct delta_list *delta_list;
1348 	u32 next_offset;
1349 	u16 size;
1350 
1351 	result = assert_not_at_end(delta_entry);
1352 	if (result != UDS_SUCCESS)
1353 		return result;
1354 
1355 	delta_list = delta_entry->delta_list;
1356 	delta_entry->offset += delta_entry->entry_bits;
1357 	size = delta_list->size;
1358 	if (unlikely(delta_entry->offset >= size)) {
1359 		delta_entry->at_end = true;
1360 		delta_entry->delta = 0;
1361 		delta_entry->is_collision = false;
1362 		result = VDO_ASSERT((delta_entry->offset == size),
1363 				    "next offset past end of delta list");
1364 		if (result != VDO_SUCCESS)
1365 			result = UDS_CORRUPT_DATA;
1366 
1367 		return result;
1368 	}
1369 
1370 	decode_delta(delta_entry);
1371 
1372 	next_offset = delta_entry->offset + delta_entry->entry_bits;
1373 	if (next_offset > size) {
1374 		/*
1375 		 * This is not an assertion because uds_validate_chapter_index_page() wants to
1376 		 * handle this error.
1377 		 */
1378 		vdo_log_warning("Decoded past the end of the delta list");
1379 		return UDS_CORRUPT_DATA;
1380 	}
1381 
1382 	return UDS_SUCCESS;
1383 }
1384 
1385 int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
1386 {
1387 	int result;
1388 	struct delta_list *delta_list = delta_entry->delta_list;
1389 
1390 	result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
1391 	if (result != VDO_SUCCESS)
1392 		return result;
1393 
1394 	delta_list->save_key = delta_entry->key - delta_entry->delta;
1395 	delta_list->save_offset = delta_entry->offset;
1396 	return UDS_SUCCESS;
1397 }
1398 
1399 static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
1400 {
1401 	const struct delta_zone *delta_zone = delta_entry->delta_zone;
1402 	u32 key_bits = (delta_zone->min_bits +
1403 			((delta_zone->incr_keys - delta_zone->min_keys + delta) /
1404 			 delta_zone->incr_keys));
1405 
1406 	delta_entry->delta = delta;
1407 	delta_entry->entry_bits = delta_entry->value_bits + key_bits;
1408 }
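
/*
 * Worked example (illustrative constants as above): with min_bits == 12, min_keys == 1257, and
 * incr_keys == 2839, delta == 1256 yields key_bits == 12 + (2839 - 1257 + 1256) / 2839 == 12,
 * while delta == 1257 yields 12 + (2839 - 1257 + 1257) / 2839 == 13, matching decode_delta().
 */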
1409 
1410 static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
1411 {
1412 	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
1413 	const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
1414 	int size = COLLISION_BYTES;
1415 	int shift = offset % BITS_PER_BYTE;
1416 
1417 	while (--size >= 0)
1418 		*name++ = get_unaligned_le16(addr++) >> shift;
1419 }
1420 
1421 static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
1422 {
1423 	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
1424 	u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
1425 	int size = COLLISION_BYTES;
1426 	int shift = offset % BITS_PER_BYTE;
1427 	u16 mask = ~((u16) 0xFF << shift);
1428 	u16 data;
1429 
1430 	while (--size >= 0) {
1431 		data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
1432 		put_unaligned_le16(data, addr++);
1433 	}
1434 }
1435 
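/*
 * Editorial illustration, not in the original source: both helpers above move
 * one name byte per iteration at an arbitrary bit offset. With shift = 3,
 * get_unaligned_le16(addr) assembles addr[1]:addr[0] as a little-endian
 * 16-bit value, so shifting right by 3 extracts bits 3..7 of addr[0] together
 * with bits 0..2 of addr[1] as one byte. In set_collision_name(), the mask
 * ~(0xFF << 3) preserves bits 0..2 of addr[0] and bits 3..7 of addr[1] while
 * the shifted name byte replaces the eight bits in between.
 */
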
int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
			      u32 key, const u8 *name,
			      struct delta_index_entry *delta_entry)
{
	int result;

	result = uds_start_delta_index_search(delta_index, list_number, key,
					      delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	do {
		result = uds_next_delta_index_entry(delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	} while (!delta_entry->at_end && (key > delta_entry->key));

	result = uds_remember_delta_index_offset(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	if (!delta_entry->at_end && (key == delta_entry->key)) {
		struct delta_index_entry collision_entry = *delta_entry;

		for (;;) {
			u8 full_name[COLLISION_BYTES];

			result = uds_next_delta_index_entry(&collision_entry);
			if (result != UDS_SUCCESS)
				return result;

			if (collision_entry.at_end || !collision_entry.is_collision)
				break;

			get_collision_name(&collision_entry, full_name);
			if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
				*delta_entry = collision_entry;
				break;
			}
		}
	}

	return UDS_SUCCESS;
}

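/*
 * Editorial sketch, not part of the original source: a hypothetical caller of
 * the lookup API above (index, list, key, name, and value are placeholders;
 * any locking the caller needs is omitted). On return the entry is positioned
 * at the first entry whose key is >= the requested key, or at the end of the
 * list, so uds_put_delta_index_entry() can insert at that point and keep the
 * list sorted:
 *
 *	struct delta_index_entry entry;
 *	int result;
 *
 *	result = uds_get_delta_index_entry(index, list, key, name, &entry);
 *	if (result != UDS_SUCCESS)
 *		return result;
 *
 *	if (!entry.at_end && (entry.key == key))
 *		result = uds_set_delta_entry_value(&entry, value);
 *	else
 *		result = uds_put_delta_index_entry(&entry, key, value, NULL);
 *
 * Note that a key match alone does not prove the full name matched; only a
 * matching collision entry (entry.is_collision set) carries the full name.
 */
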
int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
{
	int result;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = VDO_ASSERT(delta_entry->is_collision,
			    "Cannot get full block name from a non-collision delta index entry");
	if (result != VDO_SUCCESS)
		return UDS_BAD_STATE;

	get_collision_name(delta_entry, name);
	return UDS_SUCCESS;
}

u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
{
	return get_field(delta_entry->delta_zone->memory,
			 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
}

static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
{
	int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
				"delta index is mutable");
	if (result != VDO_SUCCESS)
		result = UDS_BAD_STATE;

	return result;
}

int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
{
	int result;
	u32 value_mask = (1 << delta_entry->value_bits) - 1;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = VDO_ASSERT((value & value_mask) == value,
			    "Value (%u) being set in a delta index is too large (must fit in %u bits)",
			    value, delta_entry->value_bits);
	if (result != VDO_SUCCESS)
		return UDS_INVALID_ARGUMENT;

	set_field(value, delta_entry->delta_zone->memory,
		  get_delta_entry_offset(delta_entry), delta_entry->value_bits);
	return UDS_SUCCESS;
}

/*
 * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
 * by growing_index, then rebalancing the lists in the new chunk.
 */
static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
			     size_t growing_size)
{
	ktime_t start_time;
	ktime_t end_time;
	struct delta_list *delta_lists;
	u32 i;
	size_t used_space;

	/* Calculate the amount of space that is or will be in use. */
	start_time = current_time_ns(CLOCK_MONOTONIC);
	delta_lists = delta_zone->delta_lists;
	used_space = growing_size;
	for (i = 0; i <= delta_zone->list_count + 1; i++)
		used_space += get_delta_list_byte_size(&delta_lists[i]);

	if (delta_zone->size < used_space)
		return UDS_OVERFLOW;

	/* Compute the new offsets of the delta lists. */
	compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);

	/*
	 * When we rebalance the delta list, we will include the end guard list in the rebalancing.
	 * It contains the end guard data, which must be copied.
	 */
	rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
	end_time = current_time_ns(CLOCK_MONOTONIC);
	delta_zone->rebalance_count++;
	delta_zone->rebalance_time += ktime_sub(end_time, start_time);
	return UDS_SUCCESS;
}

static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
{
	u64 free_before;
	u64 free_after;
	u64 source;
	u64 destination;
	u32 count;
	bool before_flag;
	u8 *memory;
	struct delta_zone *delta_zone = delta_entry->delta_zone;
	struct delta_list *delta_list = delta_entry->delta_list;
	/* Compute bits in use before and after the inserted bits. */
	u32 total_size = delta_list->size;
	u32 before_size = delta_entry->offset;
	u32 after_size = total_size - delta_entry->offset;

	if (total_size + size > U16_MAX) {
		delta_entry->list_overflow = true;
		delta_zone->overflow_count++;
		return UDS_OVERFLOW;
	}

	/* Compute bits available before and after the delta list. */
	free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
	free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));

	if ((size <= free_before) && (size <= free_after)) {
		/*
		 * We have enough space to use either before or after the list. Select the smaller
		 * amount of data. If it is exactly the same, try to take from the larger amount of
		 * free space.
		 */
		if (before_size < after_size)
			before_flag = true;
		else if (after_size < before_size)
			before_flag = false;
		else
			before_flag = free_before > free_after;
	} else if (size <= free_before) {
		/* There is space before but not after. */
		before_flag = true;
	} else if (size <= free_after) {
		/* There is space after but not before. */
		before_flag = false;
	} else {
		/*
		 * Neither of the surrounding spaces is large enough for this request. Extend
		 * and/or rebalance the delta list memory choosing to move the least amount of
		 * data.
		 */
		int result;
		u32 growing_index = delta_entry->list_number + 1;

		before_flag = before_size < after_size;
		if (!before_flag)
			growing_index++;
		result = extend_delta_zone(delta_zone, growing_index,
					   BITS_TO_BYTES(size));
		if (result != UDS_SUCCESS)
			return result;
	}

	delta_list->size += size;
	if (before_flag) {
		source = delta_list->start;
		destination = source - size;
		delta_list->start -= size;
		count = before_size;
	} else {
		source = delta_list->start + delta_entry->offset;
		destination = source + size;
		count = after_size;
	}

	memory = delta_zone->memory;
	move_bits(memory, source, memory, destination, count);
	return UDS_SUCCESS;
}

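/*
 * Editorial illustration, not in the original source: suppose an entry is
 * being inserted at bit offset 100 of a 4100-bit list, with room on both
 * sides. Then before_size = 100 and after_size = 4000, so before_flag is
 * true: the list start moves down by the inserted size and only the 100 bits
 * before the insertion point are copied, leaving a size-bit gap at the
 * insertion point for the new entry.
 */
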
static void encode_delta(const struct delta_index_entry *delta_entry)
{
	u32 temp;
	u32 t1;
	u32 t2;
	u64 offset;
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	u8 *memory = delta_zone->memory;

	offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
	if (delta_entry->delta < delta_zone->min_keys) {
		set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
		return;
	}

	temp = delta_entry->delta - delta_zone->min_keys;
	t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
	t2 = temp / delta_zone->incr_keys;
	set_field(t1, memory, offset, delta_zone->min_bits);
	set_zero(memory, offset + delta_zone->min_bits, t2);
	set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
}

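/*
 * Editorial worked example, not in the original source, using the same
 * hypothetical constants as the note after set_delta() (min_bits = 7,
 * min_keys = 39, incr_keys = 89): a delta of 200 gives temp = 161,
 * t1 = 161 % 89 + 39 = 111 and t2 = 161 / 89 = 1, so the encoding is the
 * 7-bit field 111, one zero bit, and a terminating 1 bit, 9 bits in all.
 * A delta of 20 (below min_keys) is written directly as a 7-bit field.
 */
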
static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
			 const u8 *name)
{
	u8 *memory = delta_entry->delta_zone->memory;
	u64 offset = get_delta_entry_offset(delta_entry);

	set_field(value, memory, offset, delta_entry->value_bits);
	encode_delta(delta_entry);
	if (name != NULL)
		set_collision_name(delta_entry, name);
}

/*
 * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
 * be provided.
 */
int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
			      const u8 *name)
{
	int result;
	struct delta_zone *delta_zone;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	if (delta_entry->is_collision) {
		/*
		 * The caller wants us to insert a collision entry onto a collision entry. This
		 * happens when we find a collision and attempt to add the name again to the index.
		 * This is normally a fatal error unless we are replaying a closed chapter while we
		 * are rebuilding a volume index.
		 */
		return UDS_DUPLICATE_NAME;
	}

	if (delta_entry->offset < delta_entry->delta_list->save_offset) {
		/*
		 * The saved entry offset is after the new entry and will no longer be valid, so
		 * replace it with the insertion point.
		 */
		result = uds_remember_delta_index_offset(delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	}

	if (name != NULL) {
		/* Insert a collision entry which is placed after this entry. */
		result = assert_not_at_end(delta_entry);
		if (result != UDS_SUCCESS)
			return result;

		result = VDO_ASSERT((key == delta_entry->key),
				    "incorrect key for collision entry");
		if (result != VDO_SUCCESS)
			return result;

		delta_entry->offset += delta_entry->entry_bits;
		set_delta(delta_entry, 0);
		delta_entry->is_collision = true;
		delta_entry->entry_bits += COLLISION_BITS;
		result = insert_bits(delta_entry, delta_entry->entry_bits);
	} else if (delta_entry->at_end) {
		/* Insert a new entry at the end of the delta list. */
		result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
		if (result != VDO_SUCCESS)
			return result;

		set_delta(delta_entry, key - delta_entry->key);
		delta_entry->key = key;
		delta_entry->at_end = false;
		result = insert_bits(delta_entry, delta_entry->entry_bits);
	} else {
		u16 old_entry_size;
		u16 additional_size;
		struct delta_index_entry next_entry;
		u32 next_value;

		/*
		 * Insert a new entry which requires the delta in the following entry to be
		 * updated.
		 */
		result = VDO_ASSERT((key < delta_entry->key),
				    "key precedes following entry");
		if (result != VDO_SUCCESS)
			return result;

		result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
				    "key affects following entry's delta");
		if (result != VDO_SUCCESS)
			return result;

		old_entry_size = delta_entry->entry_bits;
		next_entry = *delta_entry;
		next_value = uds_get_delta_entry_value(&next_entry);
		set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
		delta_entry->key = key;
		set_delta(&next_entry, next_entry.key - key);
		next_entry.offset += delta_entry->entry_bits;
		/* The two new entries are always bigger than the single entry being replaced. */
		additional_size = (delta_entry->entry_bits +
				   next_entry.entry_bits - old_entry_size);
		result = insert_bits(delta_entry, additional_size);
		if (result != UDS_SUCCESS)
			return result;

		encode_entry(&next_entry, next_value, NULL);
	}

	if (result != UDS_SUCCESS)
		return result;

	encode_entry(delta_entry, value, name);
	delta_zone = delta_entry->delta_zone;
	delta_zone->record_count++;
	delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
	return UDS_SUCCESS;
}

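/*
 * Editorial worked example, not in the original source, of the middle-insert
 * case above: suppose the search stopped at an entry with key 70 and delta 30
 * (so the previous entry's key is 40), and key 50 is being inserted. The new
 * entry gets delta 50 - 40 = 10, and the following entry's delta is rewritten
 * as 70 - 50 = 20; additional_size is the size of the two new codes minus the
 * size of the single code they replace.
 */
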
static void delete_bits(const struct delta_index_entry *delta_entry, int size)
{
	u64 source;
	u64 destination;
	u32 count;
	bool before_flag;
	struct delta_list *delta_list = delta_entry->delta_list;
	u8 *memory = delta_entry->delta_zone->memory;
	/* Compute bits retained before and after the deleted bits. */
	u32 total_size = delta_list->size;
	u32 before_size = delta_entry->offset;
	u32 after_size = total_size - delta_entry->offset - size;

	/*
	 * Determine whether to add to the available space either before or after the delta list.
	 * We prefer to move the least amount of data. If it is exactly the same, try to add to the
	 * smaller amount of free space.
	 */
	if (before_size < after_size) {
		before_flag = true;
	} else if (after_size < before_size) {
		before_flag = false;
	} else {
		u64 free_before =
			(delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
		u64 free_after =
			(delta_list[1].start - (delta_list[0].start + delta_list[0].size));

		before_flag = (free_before < free_after);
	}

	delta_list->size -= size;
	if (before_flag) {
		source = delta_list->start;
		destination = source + size;
		delta_list->start += size;
		count = before_size;
	} else {
		destination = delta_list->start + delta_entry->offset;
		source = destination + size;
		count = after_size;
	}

	move_bits(memory, source, memory, destination, count);
}

int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
{
	int result;
	struct delta_index_entry next_entry;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	next_entry = *delta_entry;
	result = uds_next_delta_index_entry(&next_entry);
	if (result != UDS_SUCCESS)
		return result;

	delta_zone = delta_entry->delta_zone;

	if (delta_entry->is_collision) {
		/* This is a collision entry, so just remove it. */
		delete_bits(delta_entry, delta_entry->entry_bits);
		next_entry.offset = delta_entry->offset;
		delta_zone->collision_count -= 1;
	} else if (next_entry.at_end) {
		/* This entry is at the end of the list, so just remove it. */
		delete_bits(delta_entry, delta_entry->entry_bits);
		next_entry.key -= delta_entry->delta;
		next_entry.offset = delta_entry->offset;
	} else {
		/* The delta in the next entry needs to be updated. */
		u32 next_value = uds_get_delta_entry_value(&next_entry);
		u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;

		if (next_entry.is_collision) {
			next_entry.is_collision = false;
			delta_zone->collision_count -= 1;
		}

		set_delta(&next_entry, delta_entry->delta + next_entry.delta);
		next_entry.offset = delta_entry->offset;
		/* The one new entry is always smaller than the two entries being replaced. */
		delete_bits(delta_entry, old_size - next_entry.entry_bits);
		encode_entry(&next_entry, next_value, NULL);
	}

	delta_zone->record_count--;
	delta_zone->discard_count++;
	*delta_entry = next_entry;

	delta_list = delta_entry->delta_list;
	if (delta_entry->offset < delta_list->save_offset) {
		/* The saved entry offset is no longer valid. */
		delta_list->save_key = 0;
		delta_list->save_offset = 0;
	}

	return UDS_SUCCESS;
}

void uds_get_delta_index_stats(const struct delta_index *delta_index,
			       struct delta_index_stats *stats)
{
	unsigned int z;
	const struct delta_zone *delta_zone;

	memset(stats, 0, sizeof(struct delta_index_stats));
	for (z = 0; z < delta_index->zone_count; z++) {
		delta_zone = &delta_index->delta_zones[z];
		stats->rebalance_time += delta_zone->rebalance_time;
		stats->rebalance_count += delta_zone->rebalance_count;
		stats->record_count += delta_zone->record_count;
		stats->collision_count += delta_zone->collision_count;
		stats->discard_count += delta_zone->discard_count;
		stats->overflow_count += delta_zone->overflow_count;
		stats->list_count += delta_zone->list_count;
	}
}

size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
{
	u16 min_bits;
	u32 incr_keys;
	u32 min_keys;

	compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
	/* On average, each delta is encoded into about min_bits + 1.5 bits. */
	return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
}

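/*
 * Editorial example, not in the original source, with hypothetical numbers:
 * for payload_bits = 8 and a mean delta whose code has min_bits = 11, 1000
 * entries need about 1000 * (8 + 11 + 1) + 1000 / 2 = 20500 bits, or roughly
 * 20.5 bits per entry, matching the min_bits + 1.5 bit average noted above.
 */
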
u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
				   u32 payload_bits, size_t bytes_per_page)
{
	unsigned int bits_per_delta_list;
	unsigned int bits_per_page;
	size_t bits_per_index;

	/* Compute the expected number of bits needed for all the entries. */
	bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
						      payload_bits);
	bits_per_delta_list = bits_per_index / list_count;

	/* Add in the immutable delta list headers. */
	bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
	/* Compute the number of usable bits on an immutable index page. */
	bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
	/*
	 * Reduce the bits per page by one immutable delta list header and one delta list to
	 * account for internal fragmentation.
	 */
	bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
	/* Now compute the number of pages needed. */
	return DIV_ROUND_UP(bits_per_index, bits_per_page);
}

void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
{
	vdo_log_ratelimit(vdo_log_info,
			  "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
			  delta_entry->list_number, delta_entry->key,
			  delta_entry->offset, delta_entry->at_end ? " end" : "",
			  delta_entry->is_collision ? " collision" : "",
			  delta_entry->delta_list->size,
			  delta_entry->list_overflow ? " overflow" : "");
	delta_entry->list_overflow = false;
}