// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */
#include "delta-index.h"

#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/log2.h>

#include "cpu.h"
#include "errors.h"
#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "string-utils.h"
#include "time-utils.h"

#include "config.h"
#include "indexer.h"

/*
 * The entries in a delta index could be stored in a single delta list, but to reduce search times
 * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
 * memory managed by the delta_zone structure. The delta_zone can move the data around within its
 * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
 * the volume index can contain over a million delta lists, we want to be efficient with the size
 * of the delta list header information. This information is encoded into 16 bytes per list. The
 * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
 * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
 * sufficient to store the size of a delta list.
 *
 * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
 * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
 * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
 * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
 * number corresponds to its index in the array.
 *
 * A standard delta list entry is stored as a fixed length payload (the value) followed by a
 * variable length key (the delta). A collision entry is used when two block names have the same
 * delta list address. A collision entry always follows a standard entry for the hash with which it
 * collides, and is encoded with DELTA == 0 with an additional 256 bit field at the end, containing
 * the full block name. An entry with a delta of 0 at the beginning of a delta list indicates a
 * normal entry.
 *
 * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
 * used by small deltas. The Huffman code is specified by three parameters, which can be computed
 * from the desired mean delta when the index is full. (See compute_coding_constants() for
 * details.)
 *
 * The bit field utilities used to read and write delta entries assume that it is possible to read
 * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
 * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
 * numbered 1 to N, and the guard lists are numbered 0 and N+1. The functions that decode the bit
 * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
 * delta list could cause this step to run off the end of the delta_zone memory, so as extra
 * protection against this happening, the tail guard list is set to all ones.
 *
 * The delta_index supports two different forms. The mutable form is created by
 * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
 * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
 * cached) chapter index pages. The immutable form does not allocate delta list headers or
 * temporary offsets, and thus is somewhat more memory efficient.
 */

/*
 * This is the largest field size supported by get_field() and set_field(). Any field that is
 * larger is not guaranteed to fit in a single byte-aligned u32.
 */
#define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1)

/*
 * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
 * is larger is not guaranteed to fit in a single byte-aligned u64.
 */
#define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1)

/*
 * This is the number of guard bytes needed at the end of the memory byte array when using the bit
 * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
 * bytes beyond the end of the desired field. The definition is written to make it clear how this
 * value is derived.
 */
#define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1)

/* The number of guard bits that are needed in the tail guard list */
#define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE)

/*
 * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
 * buffer of this size can be used with move_bits().
 */
#define DELTA_LIST_MAX_BYTE_COUNT \
	((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
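
/*
 * For example, with U16_MAX == 65535 and BITS_PER_BYTE == 8, this works out to
 * (65535 + 8) / 8 + 7 == 8192 + 7 == 8199 bytes.
 */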

/* The number of extra bytes and bits needed to store a collision entry */
#define COLLISION_BYTES UDS_RECORD_NAME_SIZE
#define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE)

/*
 * Immutable delta lists are packed into pages containing a header that encodes the delta list
 * information into 19 bits per list (64KB bit offset).
 */
#define IMMUTABLE_HEADER_SIZE 19
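
/*
 * For example, a 19 bit offset can hold values up to 2^19 - 1 == 524287, which is exactly enough
 * to address every bit of a 64KB page.
 */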

/*
 * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
 * number to increment when the format of the data changes.
 */
#define MAGIC_SIZE 8

static const char DELTA_INDEX_MAGIC[] = "DI-00002";

struct delta_index_header {
	char magic[MAGIC_SIZE];
	u32 zone_number;
	u32 zone_count;
	u32 first_list;
	u32 list_count;
	u64 record_count;
	u64 collision_count;
};

/*
 * Header data used for immutable delta index pages. This data is followed by the delta list offset
 * table.
 */
struct delta_page_header {
	/* Externally-defined nonce */
	u64 nonce;
	/* The virtual chapter number */
	u64 virtual_chapter_number;
	/* Index of the first delta list on the page */
	u16 first_list;
	/* Number of delta lists on the page */
	u16 list_count;
} __packed;

static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
{
	return delta_list->start / BITS_PER_BYTE;
}

static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
{
	unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;

	return BITS_TO_BYTES(bit_offset + delta_list->size);
}
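
/*
 * For example, a delta list with start == 21 and size == 12 begins in byte 21 / 8 == 2 of the
 * zone memory; its bit_offset is 21 % 8 == 5, so the list data spans BITS_TO_BYTES(5 + 12) == 3
 * bytes.
 */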

static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
				 u32 last)
{
	struct delta_list *delta_list;
	u64 new_start;

	if (first == last) {
		/* Only one list is moving, and we know there is space. */
		delta_list = &delta_zone->delta_lists[first];
		new_start = delta_zone->new_offsets[first];
		if (delta_list->start != new_start) {
			u64 source;
			u64 destination;

			source = get_delta_list_byte_start(delta_list);
			delta_list->start = new_start;
			destination = get_delta_list_byte_start(delta_list);
			memmove(delta_zone->memory + destination,
				delta_zone->memory + source,
				get_delta_list_byte_size(delta_list));
		}
	} else {
		/*
		 * There is more than one list. Divide the problem in half, and use recursive calls
		 * to process each half. Note that after this computation, first <= middle, and
		 * middle < last.
		 */
		u32 middle = (first + last) / 2;

		delta_list = &delta_zone->delta_lists[middle];
		new_start = delta_zone->new_offsets[middle];

		/*
		 * The direction that our middle list is moving determines which half of the
		 * problem must be processed first.
		 */
		if (new_start > delta_list->start) {
			rebalance_delta_zone(delta_zone, middle + 1, last);
			rebalance_delta_zone(delta_zone, first, middle);
		} else {
			rebalance_delta_zone(delta_zone, first, middle);
			rebalance_delta_zone(delta_zone, middle + 1, last);
		}
	}
}

static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
{
	/* Round up so that each zone is a multiple of 64K in size. */
	size_t ALLOC_BOUNDARY = 64 * 1024;

	return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
}
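
/*
 * For example, dividing 1000000 bytes of memory among 4 zones gives 250000 bytes per zone, which
 * rounds up to 262144 bytes (4 * 64K).
 */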

void uds_reset_delta_index(const struct delta_index *delta_index)
{
	unsigned int z;

	/*
	 * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
	 * before the first real entry and one after so that we don't need to bounds check the
	 * array access when calculating preceding and following gap sizes.
	 */
	for (z = 0; z < delta_index->zone_count; z++) {
		u64 list_bits;
		u64 spacing;
		u64 offset;
		unsigned int i;
		struct delta_zone *zone = &delta_index->delta_zones[z];
		struct delta_list *delta_lists = zone->delta_lists;

		/* Zeroing the delta list headers initializes the head guard list correctly. */
		memset(delta_lists, 0,
		       (zone->list_count + 2) * sizeof(struct delta_list));

		/* Set all the bits in the end guard list. */
		list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
		delta_lists[zone->list_count + 1].start = list_bits;
		delta_lists[zone->list_count + 1].size = GUARD_BITS;
		memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
		       POST_FIELD_GUARD_BYTES);

		/* Evenly space out the real delta lists by setting regular offsets. */
		spacing = list_bits / zone->list_count;
		offset = spacing / 2;
		for (i = 1; i <= zone->list_count; i++) {
			delta_lists[i].start = offset;
			offset += spacing;
		}

		/* Update the statistics. */
		zone->discard_count += zone->record_count;
		zone->record_count = 0;
		zone->collision_count = 0;
	}
}

/*
 * Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
 * three parameters:
 *
 *  MINBITS   The number of bits in the smallest code
 *  BASE      The number of values coded using a code of length MINBITS
 *  INCR      The number of values coded by using one additional bit
 *
 * These parameters are related by this equation:
 *
 *	BASE + INCR == 1 << MINBITS
 *
 * The math for the Huffman code of an exponential distribution says that
 *
 *	INCR = log(2) * MEAN_DELTA
 *
 * Then use the smallest MINBITS value so that
 *
 *	(1 << MINBITS) > INCR
 *
 * And then
 *
 *	BASE = (1 << MINBITS) - INCR
 *
 * Now the index can generate a code such that
 * - The first BASE values code using MINBITS bits.
 * - The next INCR values code using MINBITS+1 bits.
 * - The next INCR values code using MINBITS+2 bits.
 * - (and so on).
 */
static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
{
	/*
	 * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
	 * floating point, use a really good integer approximation.
	 */
	*incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
	*min_bits = bits_per(*incr_keys + 1);
	*min_keys = (1 << *min_bits) - *incr_keys;
}
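
/*
 * For example, a mean delta of 4096 yields incr_keys == 2839 (the rounded value of
 * log(2) * 4096), min_bits == bits_per(2840) == 12, and min_keys == 4096 - 2839 == 1257, which
 * satisfies BASE + INCR == 1 << MINBITS.
 */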

void uds_uninitialize_delta_index(struct delta_index *delta_index)
{
	unsigned int z;

	if (delta_index->delta_zones == NULL)
		return;

	for (z = 0; z < delta_index->zone_count; z++) {
		vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets));
		vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists));
		vdo_free(vdo_forget(delta_index->delta_zones[z].memory));
	}

	vdo_free(delta_index->delta_zones);
	memset(delta_index, 0, sizeof(struct delta_index));
}

static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
				 u32 first_list, u32 list_count, u32 mean_delta,
				 u32 payload_bits, u8 tag)
{
	int result;

	result = vdo_allocate(size, u8, "delta list", &delta_zone->memory);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(list_count + 2, u64, "delta list temp",
			      &delta_zone->new_offsets);
	if (result != VDO_SUCCESS)
		return result;

	/* Allocate the delta lists. */
	result = vdo_allocate(list_count + 2, struct delta_list, "delta lists",
			      &delta_zone->delta_lists);
	if (result != VDO_SUCCESS)
		return result;

	compute_coding_constants(mean_delta, &delta_zone->min_bits,
				 &delta_zone->min_keys, &delta_zone->incr_keys);
	delta_zone->value_bits = payload_bits;
	delta_zone->buffered_writer = NULL;
	delta_zone->size = size;
	delta_zone->rebalance_time = 0;
	delta_zone->rebalance_count = 0;
	delta_zone->record_count = 0;
	delta_zone->collision_count = 0;
	delta_zone->discard_count = 0;
	delta_zone->overflow_count = 0;
	delta_zone->first_list = first_list;
	delta_zone->list_count = list_count;
	delta_zone->tag = tag;

	return UDS_SUCCESS;
}

int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
			       u32 list_count, u32 mean_delta, u32 payload_bits,
			       size_t memory_size, u8 tag)
{
	int result;
	unsigned int z;
	size_t zone_memory;

	result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones",
			      &delta_index->delta_zones);
	if (result != VDO_SUCCESS)
		return result;

	delta_index->zone_count = zone_count;
	delta_index->list_count = list_count;
	delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
	delta_index->memory_size = 0;
	delta_index->mutable = true;
	delta_index->tag = tag;

	for (z = 0; z < zone_count; z++) {
		u32 lists_in_zone = delta_index->lists_per_zone;
		u32 first_list_in_zone = z * lists_in_zone;

		if (z == zone_count - 1) {
			/*
			 * The last zone gets fewer lists if zone_count doesn't evenly divide
			 * list_count. We'll have an underflow if the assertion below doesn't hold.
			 */
			if (delta_index->list_count <= first_list_in_zone) {
				uds_uninitialize_delta_index(delta_index);
				return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
							      "%u delta lists not enough for %u zones",
							      list_count, zone_count);
			}
			lists_in_zone = delta_index->list_count - first_list_in_zone;
		}

		zone_memory = get_zone_memory_size(zone_count, memory_size);
		result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
					       first_list_in_zone, lists_in_zone,
					       mean_delta, payload_bits, tag);
		if (result != UDS_SUCCESS) {
			uds_uninitialize_delta_index(delta_index);
			return result;
		}

		delta_index->memory_size +=
			(sizeof(struct delta_zone) + zone_memory +
			 (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
	}

	uds_reset_delta_index(delta_index);
	return UDS_SUCCESS;
}

/* Read a bit field from an arbitrary bit boundary. */
static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
{
	const void *addr = memory + offset / BITS_PER_BYTE;

	return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
}

/* Write a bit field to an arbitrary bit boundary. */
static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
{
	void *addr = memory + offset / BITS_PER_BYTE;
	int shift = offset % BITS_PER_BYTE;
	u32 data = get_unaligned_le32(addr);

	data &= ~(((1 << size) - 1) << shift);
	data |= value << shift;
	put_unaligned_le32(data, addr);
}
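
/*
 * For example, if memory begins with the bytes 0xAB 0xCD, then get_field(memory, 4, 8) combines
 * the high nibble of byte 0 with the low nibble of byte 1 and returns 0xDA.
 */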

/* Get the bit offset to the immutable delta list header. */
static inline u32 get_immutable_header_offset(u32 list_number)
{
	return sizeof(struct delta_page_header) * BITS_PER_BYTE +
		list_number * IMMUTABLE_HEADER_SIZE;
}

/* Get the bit offset to the start of the immutable delta list bit stream. */
static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
{
	return get_field(memory, get_immutable_header_offset(list_number),
			 IMMUTABLE_HEADER_SIZE);
}

/* Set the bit offset to the start of the immutable delta list bit stream. */
static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
{
	set_field(start, memory, get_immutable_header_offset(list_number),
		  IMMUTABLE_HEADER_SIZE);
}
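
/*
 * For example, since the packed delta_page_header occupies 20 bytes, the header for list 0 begins
 * at bit offset 160, and the header for list N begins at bit offset 160 + 19 * N.
 */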

static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
				    u8 *memory, size_t memory_size)
{
	unsigned int i;

	/*
	 * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
	 * entire volume at least once.
	 */
	if (nonce != expected_nonce)
		return false;

	/* Verify that the number of delta lists can fit in the page. */
	if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
			  BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
		return false;

	/*
	 * Verify that the first delta list is immediately after the last delta
	 * list header.
	 */
	if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
		return false;

	/* Verify that the lists are in the correct order. */
	for (i = 0; i < list_count; i++) {
		if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
			return false;
	}

	/*
	 * Verify that the last list ends on the page, and that there is room
	 * for the post-field guard bits.
	 */
	if (get_immutable_start(memory, list_count) >
	    (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
		return false;

	/* Verify that the guard bytes are correctly set to all ones. */
	for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
		if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
			return false;
	}

	/* All verifications passed. */
	return true;
}

/* Initialize a delta index page to refer to a supplied page. */
int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
				    u64 expected_nonce, u32 mean_delta, u32 payload_bits,
				    u8 *memory, size_t memory_size)
{
	u64 nonce;
	u64 vcn;
	u64 first_list;
	u64 list_count;
	struct delta_page_header *header = (struct delta_page_header *) memory;
	struct delta_zone *delta_zone = &delta_index_page->delta_zone;
	const u8 *nonce_addr = (const u8 *) &header->nonce;
	const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
	const u8 *first_list_addr = (const u8 *) &header->first_list;
	const u8 *list_count_addr = (const u8 *) &header->list_count;

	/* First assume that the header is little endian. */
	nonce = get_unaligned_le64(nonce_addr);
	vcn = get_unaligned_le64(vcn_addr);
	first_list = get_unaligned_le16(first_list_addr);
	list_count = get_unaligned_le16(list_count_addr);
	if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
				     memory_size)) {
		/* If that fails, try big endian. */
		nonce = get_unaligned_be64(nonce_addr);
		vcn = get_unaligned_be64(vcn_addr);
		first_list = get_unaligned_be16(first_list_addr);
		list_count = get_unaligned_be16(list_count_addr);
		if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
					     memory_size)) {
			/*
			 * Both attempts failed. Do not log this as an error, because it can happen
			 * during a rebuild if we haven't written the entire volume at least once.
			 */
			return UDS_CORRUPT_DATA;
		}
	}

	delta_index_page->delta_index.delta_zones = delta_zone;
	delta_index_page->delta_index.zone_count = 1;
	delta_index_page->delta_index.list_count = list_count;
	delta_index_page->delta_index.lists_per_zone = list_count;
	delta_index_page->delta_index.mutable = false;
	delta_index_page->delta_index.tag = 'p';
	delta_index_page->virtual_chapter_number = vcn;
	delta_index_page->lowest_list_number = first_list;
	delta_index_page->highest_list_number = first_list + list_count - 1;

	compute_coding_constants(mean_delta, &delta_zone->min_bits,
				 &delta_zone->min_keys, &delta_zone->incr_keys);
	delta_zone->value_bits = payload_bits;
	delta_zone->memory = memory;
	delta_zone->delta_lists = NULL;
	delta_zone->new_offsets = NULL;
	delta_zone->buffered_writer = NULL;
	delta_zone->size = memory_size;
	delta_zone->rebalance_time = 0;
	delta_zone->rebalance_count = 0;
	delta_zone->record_count = 0;
	delta_zone->collision_count = 0;
	delta_zone->discard_count = 0;
	delta_zone->overflow_count = 0;
	delta_zone->first_list = 0;
	delta_zone->list_count = list_count;
	delta_zone->tag = 'p';

	return UDS_SUCCESS;
}

/* Read a large bit field from an arbitrary bit boundary. */
static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
{
	const void *addr = memory + offset / BITS_PER_BYTE;

	return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
}

/* Write a large bit field to an arbitrary bit boundary. */
static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
{
	void *addr = memory + offset / BITS_PER_BYTE;
	u8 shift = offset % BITS_PER_BYTE;
	u64 data = get_unaligned_le64(addr);

	data &= ~(((1UL << size) - 1) << shift);
	data |= value << shift;
	put_unaligned_le64(data, addr);
}

/* Set a sequence of bits to all zeros. */
static inline void set_zero(u8 *memory, u64 offset, u32 size)
{
	if (size > 0) {
		u8 *addr = memory + offset / BITS_PER_BYTE;
		u8 shift = offset % BITS_PER_BYTE;
		u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;

		*addr++ &= ~(((1 << count) - 1) << shift);
		for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
			*addr++ = 0;

		if (size > 0)
			*addr &= 0xFF << size;
	}
}
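
/*
 * For example, set_zero(memory, 3, 7) clears bits 3 through 9: the first byte is masked with
 * ~(0x1F << 3) to clear its top five bits, and the second byte is masked with 0xFF << 2 to clear
 * its bottom two bits.
 */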

/*
 * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
 * size and memory offsets are measured in bits.
 */
static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
{
	const u8 *source;
	u8 *destination;
	u8 offset;
	u8 count;
	u64 field;

	/* Start by moving one field that ends on a destination int boundary. */
	count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
	field = get_big_field(from, from_offset, count);
	set_big_field(field, to, to_offset, count);
	from_offset += count;
	to_offset += count;
	size -= count;

	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
	offset = from_offset % BITS_PER_TYPE(u32);
	source = from + (from_offset - offset) / BITS_PER_BYTE;
	destination = to + to_offset / BITS_PER_BYTE;
	while (size > MAX_BIG_FIELD_BITS) {
		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
		source += sizeof(u32);
		destination += sizeof(u32);
		from_offset += BITS_PER_TYPE(u32);
		to_offset += BITS_PER_TYPE(u32);
		size -= BITS_PER_TYPE(u32);
	}

	/* Finish up by moving any remaining bits. */
	if (size > 0) {
		field = get_big_field(from, from_offset, size);
		set_big_field(field, to, to_offset, size);
	}
}

/*
 * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
 * size and memory offsets are measured in bits.
 */
static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
{
	const u8 *source;
	u8 *destination;
	u8 offset;
	u8 count;
	u64 field;

	/* Start by moving one field that begins on a destination int boundary. */
	count = (to_offset + size) % BITS_PER_TYPE(u32);
	if (count > 0) {
		size -= count;
		field = get_big_field(from, from_offset + size, count);
		set_big_field(field, to, to_offset + size, count);
	}

	/* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
	offset = (from_offset + size) % BITS_PER_TYPE(u32);
	source = from + (from_offset + size - offset) / BITS_PER_BYTE;
	destination = to + (to_offset + size) / BITS_PER_BYTE;
	while (size > MAX_BIG_FIELD_BITS) {
		source -= sizeof(u32);
		destination -= sizeof(u32);
		size -= BITS_PER_TYPE(u32);
		put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
	}

	/* Finish up by moving any remaining bits. */
	if (size > 0) {
		field = get_big_field(from, from_offset, size);
		set_big_field(field, to, to_offset, size);
	}
}

/*
 * Move bits from one field to another. When the fields overlap, behave as if we first move all the
 * bits from the source to a temporary value, and then move all the bits from the temporary value
 * to the destination. The size and memory offsets are measured in bits.
 */
static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
{
	u64 field;

	/* A small move doesn't require special handling. */
	if (size <= MAX_BIG_FIELD_BITS) {
		if (size > 0) {
			field = get_big_field(from, from_offset, size);
			set_big_field(field, to, to_offset, size);
		}

		return;
	}

	if (from_offset > to_offset)
		move_bits_down(from, from_offset, to, to_offset, size);
	else
		move_bits_up(from, from_offset, to, to_offset, size);
}
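
/*
 * The split matters because the source and destination often lie in the same buffer. Moving a
 * list to a lower address must copy the lower addressed bits first, and moving it to a higher
 * address must copy the higher addressed bits first; otherwise an overlapping copy would
 * overwrite source bits before reading them.
 */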

/*
 * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
 * lists (starting with a specified list index) is copied from the mutable delta index into a
 * memory page used in the immutable index. The number of lists copied onto the page is returned in
 * list_count.
 */
int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
			      u8 *memory, size_t memory_size, u64 virtual_chapter_number,
			      u32 first_list, u32 *list_count)
{
	const struct delta_zone *delta_zone;
	struct delta_list *delta_lists;
	u32 max_lists;
	u32 n_lists = 0;
	u32 offset;
	u32 i;
	int free_bits;
	int bits;
	struct delta_page_header *header;

	delta_zone = &delta_index->delta_zones[0];
	delta_lists = &delta_zone->delta_lists[first_list + 1];
	max_lists = delta_index->list_count - first_list;

	/*
	 * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
	 * delta list offset, and the guard bytes from the page size to determine how much space is
	 * available for delta lists.
	 */
	free_bits = memory_size * BITS_PER_BYTE;
	free_bits -= get_immutable_header_offset(1);
	free_bits -= GUARD_BITS;
	if (free_bits < IMMUTABLE_HEADER_SIZE) {
		/* This page is too small to store any delta lists. */
		return vdo_log_error_strerror(UDS_OVERFLOW,
					      "Chapter Index Page of %zu bytes is too small",
					      memory_size);
	}

	while (n_lists < max_lists) {
		/* Each list requires a delta list offset and the list data. */
		bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
		if (bits > free_bits)
			break;

		n_lists++;
		free_bits -= bits;
	}

	*list_count = n_lists;

	header = (struct delta_page_header *) memory;
	put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
	put_unaligned_le64(virtual_chapter_number,
			   (u8 *) &header->virtual_chapter_number);
	put_unaligned_le16(first_list, (u8 *) &header->first_list);
	put_unaligned_le16(n_lists, (u8 *) &header->list_count);

	/* Construct the delta list offset table. */
	offset = get_immutable_header_offset(n_lists + 1);
	set_immutable_start(memory, 0, offset);
	for (i = 0; i < n_lists; i++) {
		offset += delta_lists[i].size;
		set_immutable_start(memory, i + 1, offset);
	}

	/* Copy the delta list data onto the memory page. */
	for (i = 0; i < n_lists; i++) {
		move_bits(delta_zone->memory, delta_lists[i].start, memory,
			  get_immutable_start(memory, i), delta_lists[i].size);
	}

	/* Set all the bits in the guard bytes. */
	memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
	       POST_FIELD_GUARD_BYTES);
	return UDS_SUCCESS;
}

/* Compute the new offsets of the delta lists. */
static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
				     size_t growing_size, size_t used_space)
{
	size_t spacing;
	u32 i;
	struct delta_list *delta_lists = delta_zone->delta_lists;
	u32 tail_guard_index = delta_zone->list_count + 1;

	spacing = (delta_zone->size - used_space) / delta_zone->list_count;
	delta_zone->new_offsets[0] = 0;
	for (i = 0; i <= delta_zone->list_count; i++) {
		delta_zone->new_offsets[i + 1] =
			(delta_zone->new_offsets[i] +
			 get_delta_list_byte_size(&delta_lists[i]) + spacing);
		delta_zone->new_offsets[i] *= BITS_PER_BYTE;
		delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
		if (i == 0)
			delta_zone->new_offsets[i + 1] -= spacing / 2;
		if (i + 1 == growing_index)
			delta_zone->new_offsets[i + 1] += growing_size;
	}

	delta_zone->new_offsets[tail_guard_index] =
		(delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
}

static void rebalance_lists(struct delta_zone *delta_zone)
{
	struct delta_list *delta_lists;
	u32 i;
	size_t used_space = 0;

	/* Extend and balance memory to receive the delta lists */
	delta_lists = delta_zone->delta_lists;
	for (i = 0; i <= delta_zone->list_count + 1; i++)
		used_space += get_delta_list_byte_size(&delta_lists[i]);

	compute_new_list_offsets(delta_zone, 0, 0, used_space);
	for (i = 1; i <= delta_zone->list_count + 1; i++)
		delta_lists[i].start = delta_zone->new_offsets[i];
}

/* Start restoring a delta index from multiple input streams. */
int uds_start_restoring_delta_index(struct delta_index *delta_index,
				    struct buffered_reader **buffered_readers,
				    unsigned int reader_count)
{
	int result;
	unsigned int zone_count = reader_count;
	u64 record_count = 0;
	u64 collision_count = 0;
	u32 first_list[MAX_ZONES];
	u32 list_count[MAX_ZONES];
	unsigned int z;
	u32 list_next = 0;
	const struct delta_zone *delta_zone;

	/* Read and validate each header. */
	for (z = 0; z < zone_count; z++) {
		struct delta_index_header header;
		u8 buffer[sizeof(struct delta_index_header)];
		size_t offset = 0;

		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
						       sizeof(buffer));
		if (result != UDS_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to read delta index header");
		}

		memcpy(&header.magic, buffer, MAGIC_SIZE);
		offset += MAGIC_SIZE;
		decode_u32_le(buffer, &offset, &header.zone_number);
		decode_u32_le(buffer, &offset, &header.zone_count);
		decode_u32_le(buffer, &offset, &header.first_list);
		decode_u32_le(buffer, &offset, &header.list_count);
		decode_u64_le(buffer, &offset, &header.record_count);
		decode_u64_le(buffer, &offset, &header.collision_count);

		result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
				    "%zu bytes decoded of %zu expected", offset,
				    sizeof(struct delta_index_header));
		if (result != VDO_SUCCESS) {
			return vdo_log_warning_strerror(result,
							"failed to read delta index header");
		}

		if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index file has bad magic number");
		}

		if (zone_count != header.zone_count) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index files contain mismatched zone counts (%u,%u)",
							zone_count, header.zone_count);
		}

		if (header.zone_number != z) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index zone %u found in slot %u",
							header.zone_number, z);
		}

		first_list[z] = header.first_list;
		list_count[z] = header.list_count;
		record_count += header.record_count;
		collision_count += header.collision_count;

		if (first_list[z] != list_next) {
			return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
							"delta index file for zone %u starts with list %u instead of list %u",
							z, first_list[z], list_next);
		}

		list_next += list_count[z];
	}

	if (list_next != delta_index->list_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"delta index files contain %u delta lists instead of %u delta lists",
						list_next, delta_index->list_count);
	}

	if (collision_count > record_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"delta index files contain %llu collisions and %llu records",
						(unsigned long long) collision_count,
						(unsigned long long) record_count);
	}

	uds_reset_delta_index(delta_index);
	delta_index->delta_zones[0].record_count = record_count;
	delta_index->delta_zones[0].collision_count = collision_count;

	/* Read the delta lists and distribute them to the proper zones. */
	for (z = 0; z < zone_count; z++) {
		u32 i;

		delta_index->load_lists[z] = 0;
		for (i = 0; i < list_count[z]; i++) {
			u16 delta_list_size;
			u32 list_number;
			unsigned int zone_number;
			u8 size_data[sizeof(u16)];

			result = uds_read_from_buffered_reader(buffered_readers[z],
							       size_data,
							       sizeof(size_data));
			if (result != UDS_SUCCESS) {
				return vdo_log_warning_strerror(result,
								"failed to read delta index size");
			}

			delta_list_size = get_unaligned_le16(size_data);
			if (delta_list_size > 0)
				delta_index->load_lists[z] += 1;

			list_number = first_list[z] + i;
			zone_number = list_number / delta_index->lists_per_zone;
			delta_zone = &delta_index->delta_zones[zone_number];
			list_number -= delta_zone->first_list;
			delta_zone->delta_lists[list_number + 1].size = delta_list_size;
		}
	}

	/* Prepare each zone to start receiving the delta list data. */
	for (z = 0; z < delta_index->zone_count; z++)
		rebalance_lists(&delta_index->delta_zones[z]);

	return UDS_SUCCESS;
}

static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
				      const struct delta_list_save_info *save_info,
				      const u8 *data)
{
	struct delta_list *delta_list;
	u16 bit_count;
	u16 byte_count;
	u32 list_number = save_info->index - delta_zone->first_list;

	if (list_number >= delta_zone->list_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"invalid delta list number %u not in range [%u,%u)",
						save_info->index, delta_zone->first_list,
						delta_zone->first_list + delta_zone->list_count);
	}

	delta_list = &delta_zone->delta_lists[list_number + 1];
	if (delta_list->size == 0) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"unexpected delta list number %u",
						save_info->index);
	}

	bit_count = delta_list->size + save_info->bit_offset;
	byte_count = BITS_TO_BYTES(bit_count);
	if (save_info->byte_count != byte_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"unexpected delta list size %u != %u",
						save_info->byte_count, byte_count);
	}

	move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
		  delta_list->size);
	return UDS_SUCCESS;
}

static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
				   struct buffered_reader *buffered_reader, u8 *data)
{
	int result;
	struct delta_list_save_info save_info;
	u8 buffer[sizeof(struct delta_list_save_info)];
	unsigned int new_zone;

	result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
	if (result != UDS_SUCCESS) {
		return vdo_log_warning_strerror(result,
						"failed to read delta list data");
	}

	save_info = (struct delta_list_save_info) {
		.tag = buffer[0],
		.bit_offset = buffer[1],
		.byte_count = get_unaligned_le16(&buffer[2]),
		.index = get_unaligned_le32(&buffer[4]),
	};

	if ((save_info.bit_offset >= BITS_PER_BYTE) ||
	    (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"corrupt delta list data");
	}

	/* Make sure the data is intended for this delta index. */
	if (save_info.tag != delta_index->tag)
		return UDS_CORRUPT_DATA;

	if (save_info.index >= delta_index->list_count) {
		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,
						"invalid delta list number %u of %u",
						save_info.index,
						delta_index->list_count);
	}

	result = uds_read_from_buffered_reader(buffered_reader, data,
					       save_info.byte_count);
	if (result != UDS_SUCCESS) {
		return vdo_log_warning_strerror(result,
						"failed to read delta list data");
	}

	delta_index->load_lists[load_zone] -= 1;
	new_zone = save_info.index / delta_index->lists_per_zone;
	return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
					  &save_info, data);
}

/* Restore delta lists from saved data. */
int uds_finish_restoring_delta_index(struct delta_index *delta_index,
				     struct buffered_reader **buffered_readers,
				     unsigned int reader_count)
{
	int result;
	int saved_result = UDS_SUCCESS;
	unsigned int z;
	u8 *data;

	result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
	if (result != VDO_SUCCESS)
		return result;

	for (z = 0; z < reader_count; z++) {
		while (delta_index->load_lists[z] > 0) {
			result = restore_delta_list_data(delta_index, z,
							 buffered_readers[z], data);
			if (result != UDS_SUCCESS) {
				saved_result = result;
				break;
			}
		}
	}

	vdo_free(data);
	return saved_result;
}

int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
				unsigned int reader_count)
{
	int result;
	unsigned int z;
	u8 buffer[sizeof(struct delta_list_save_info)];

	for (z = 0; z < reader_count; z++) {
		result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
						       sizeof(buffer));
		if (result != UDS_SUCCESS)
			return result;

		if (buffer[0] != 'z')
			return UDS_CORRUPT_DATA;
	}

	return UDS_SUCCESS;
}

static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
{
	struct delta_list *delta_list;
	u8 buffer[sizeof(struct delta_list_save_info)];
	int result;

	delta_list = &zone->delta_lists[flush_index + 1];

	buffer[0] = zone->tag;
	buffer[1] = delta_list->start % BITS_PER_BYTE;
	put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
	put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);

	result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
					      sizeof(buffer));
	if (result != UDS_SUCCESS) {
		vdo_log_warning_strerror(result, "failed to write delta list memory");
		return result;
	}

	result = uds_write_to_buffered_writer(zone->buffered_writer,
					      zone->memory + get_delta_list_byte_start(delta_list),
					      get_delta_list_byte_size(delta_list));
	if (result != UDS_SUCCESS)
		vdo_log_warning_strerror(result, "failed to write delta list memory");

	return result;
}

/* Start saving a delta index zone to a buffered output stream. */
int uds_start_saving_delta_index(const struct delta_index *delta_index,
				 unsigned int zone_number,
				 struct buffered_writer *buffered_writer)
{
	int result;
	u32 i;
	struct delta_zone *delta_zone;
	u8 buffer[sizeof(struct delta_index_header)];
	size_t offset = 0;

	delta_zone = &delta_index->delta_zones[zone_number];
	memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
	offset += MAGIC_SIZE;
	encode_u32_le(buffer, &offset, zone_number);
	encode_u32_le(buffer, &offset, delta_index->zone_count);
	encode_u32_le(buffer, &offset, delta_zone->first_list);
	encode_u32_le(buffer, &offset, delta_zone->list_count);
	encode_u64_le(buffer, &offset, delta_zone->record_count);
	encode_u64_le(buffer, &offset, delta_zone->collision_count);

	result = VDO_ASSERT(offset == sizeof(struct delta_index_header),
			    "%zu bytes encoded of %zu expected", offset,
			    sizeof(struct delta_index_header));
	if (result != VDO_SUCCESS)
		return result;

	result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
	if (result != UDS_SUCCESS)
		return vdo_log_warning_strerror(result,
						"failed to write delta index header");

	for (i = 0; i < delta_zone->list_count; i++) {
		u8 data[sizeof(u16)];
		struct delta_list *delta_list;

		delta_list = &delta_zone->delta_lists[i + 1];
		put_unaligned_le16(delta_list->size, data);
		result = uds_write_to_buffered_writer(buffered_writer, data,
						      sizeof(data));
		if (result != UDS_SUCCESS)
			return vdo_log_warning_strerror(result,
							"failed to write delta list size");
	}

	delta_zone->buffered_writer = buffered_writer;
	return UDS_SUCCESS;
}

int uds_finish_saving_delta_index(const struct delta_index *delta_index,
				  unsigned int zone_number)
{
	int result;
	int first_error = UDS_SUCCESS;
	u32 i;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	delta_zone = &delta_index->delta_zones[zone_number];
	for (i = 0; i < delta_zone->list_count; i++) {
		delta_list = &delta_zone->delta_lists[i + 1];
		if (delta_list->size > 0) {
			result = flush_delta_list(delta_zone, i);
			if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
				first_error = result;
		}
	}

	delta_zone->buffered_writer = NULL;
	return first_error;
}

int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
{
	int result;
	u8 buffer[sizeof(struct delta_list_save_info)];

	memset(buffer, 0, sizeof(struct delta_list_save_info));
	buffer[0] = 'z';

	result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
	if (result != UDS_SUCCESS)
		vdo_log_warning_strerror(result, "failed to write guard delta list");

	return UDS_SUCCESS;
}

size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
{
	/* One zone will use at least as much memory as other zone counts. */
	return (sizeof(struct delta_index_header) +
		list_count * (sizeof(struct delta_list_save_info) + 1) +
		get_zone_memory_size(1, memory_size));
}
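
/*
 * For example, saving a single-zone index with 1024 lists and 1000000 bytes of memory needs one
 * delta_index_header, sizeof(struct delta_list_save_info) + 1 bytes for each of the 1024 lists,
 * and 1048576 bytes for the zone memory (1000000 rounded up to a 64K boundary).
 */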

static int assert_not_at_end(const struct delta_index_entry *delta_entry)
{
	int result = VDO_ASSERT(!delta_entry->at_end,
				"operation is invalid because the list entry is at the end of the delta list");

	if (result != VDO_SUCCESS)
		result = UDS_BAD_STATE;

	return result;
}

/*
 * Prepare to search for an entry in the specified delta list.
 *
 * This is always the first function to be called when dealing with delta index entries. It is
 * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
 * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
 * entry from the list.
 */
int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
				 u32 key, struct delta_index_entry *delta_entry)
{
	int result;
	unsigned int zone_number;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	result = VDO_ASSERT((list_number < delta_index->list_count),
			    "Delta list number (%u) is out of range (%u)", list_number,
			    delta_index->list_count);
	if (result != VDO_SUCCESS)
		return UDS_CORRUPT_DATA;

	zone_number = list_number / delta_index->lists_per_zone;
	delta_zone = &delta_index->delta_zones[zone_number];
	list_number -= delta_zone->first_list;
	result = VDO_ASSERT((list_number < delta_zone->list_count),
			    "Delta list number (%u) is out of range (%u) for zone (%u)",
			    list_number, delta_zone->list_count, zone_number);
	if (result != VDO_SUCCESS)
		return UDS_CORRUPT_DATA;

	if (delta_index->mutable) {
		delta_list = &delta_zone->delta_lists[list_number + 1];
	} else {
		u32 end_offset;

		/*
		 * Translate the immutable delta list header into a temporary
		 * full delta list header.
		 */
		delta_list = &delta_entry->temp_delta_list;
		delta_list->start = get_immutable_start(delta_zone->memory, list_number);
		end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
		delta_list->size = end_offset - delta_list->start;
		delta_list->save_key = 0;
		delta_list->save_offset = 0;
	}

	if (key > delta_list->save_key) {
		delta_entry->key = delta_list->save_key;
		delta_entry->offset = delta_list->save_offset;
	} else {
		delta_entry->key = 0;
		delta_entry->offset = 0;
		if (key == 0) {
			/*
			 * This usually means we're about to walk the entire delta list, so get all
			 * of it into the CPU cache.
			 */
			uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
					   delta_list->size / BITS_PER_BYTE, false);
		}
	}

	delta_entry->at_end = false;
	delta_entry->delta_zone = delta_zone;
	delta_entry->delta_list = delta_list;
	delta_entry->entry_bits = 0;
	delta_entry->is_collision = false;
	delta_entry->list_number = list_number;
	delta_entry->list_overflow = false;
	delta_entry->value_bits = delta_zone->value_bits;
	return UDS_SUCCESS;
}

static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
{
	return delta_entry->delta_list->start + delta_entry->offset;
}

/*
 * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
 * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
 * bit stream and update the delta_index_entry to describe the entry.
 */
static inline void decode_delta(struct delta_index_entry *delta_entry)
{
	int key_bits;
	u32 delta;
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	const u8 *memory = delta_zone->memory;
	u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
	const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
	int offset = delta_offset % BITS_PER_BYTE;
	u32 data = get_unaligned_le32(addr) >> offset;

	addr += sizeof(u32);
	key_bits = delta_zone->min_bits;
	delta = data & ((1 << key_bits) - 1);
	if (delta >= delta_zone->min_keys) {
		data >>= key_bits;
		if (data == 0) {
			key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
			while ((data = get_unaligned_le32(addr)) == 0) {
				addr += sizeof(u32);
				key_bits += sizeof(u32) * BITS_PER_BYTE;
			}
		}
		key_bits += ffs(data);
		delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
	}
	delta_entry->delta = delta;
	delta_entry->key += delta;

	/* Check for a collision, a delta of zero after the start. */
	if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
		delta_entry->is_collision = true;
		delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
	} else {
		delta_entry->is_collision = false;
		delta_entry->entry_bits = delta_entry->value_bits + key_bits;
	}
}
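
/*
 * For example, with min_bits == 12, min_keys == 1257, and incr_keys == 2839, any delta below 1257
 * is stored in exactly 12 key bits. A larger delta reuses the 12 bit field as a remainder and
 * appends a unary run of 0 bits terminated by a 1, so each additional key bit extends the
 * representable range by another 2839 values.
 */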

noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
{
	int result;
	const struct delta_list *delta_list;
	u32 next_offset;
	u16 size;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	delta_list = delta_entry->delta_list;
	delta_entry->offset += delta_entry->entry_bits;
	size = delta_list->size;
	if (unlikely(delta_entry->offset >= size)) {
		delta_entry->at_end = true;
		delta_entry->delta = 0;
		delta_entry->is_collision = false;
		result = VDO_ASSERT((delta_entry->offset == size),
				    "next offset past end of delta list");
		if (result != VDO_SUCCESS)
			result = UDS_CORRUPT_DATA;

		return result;
	}

	decode_delta(delta_entry);

	next_offset = delta_entry->offset + delta_entry->entry_bits;
	if (next_offset > size) {
		/*
		 * This is not an assertion because uds_validate_chapter_index_page() wants to
		 * handle this error.
		 */
		vdo_log_warning("Decoded past the end of the delta list");
		return UDS_CORRUPT_DATA;
	}

	return UDS_SUCCESS;
}

int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
{
	int result;
	struct delta_list *delta_list = delta_entry->delta_list;

	result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision");
	if (result != VDO_SUCCESS)
		return result;

	delta_list->save_key = delta_entry->key - delta_entry->delta;
	delta_list->save_offset = delta_entry->offset;
	return UDS_SUCCESS;
}

static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
{
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	u32 key_bits = (delta_zone->min_bits +
			((delta_zone->incr_keys - delta_zone->min_keys + delta) /
			 delta_zone->incr_keys));

	delta_entry->delta = delta;
	delta_entry->entry_bits = delta_entry->value_bits + key_bits;
}
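
/*
 * For example, with min_bits == 12, min_keys == 1257, and incr_keys == 2839, a delta of 2000
 * needs 12 + (2839 - 1257 + 2000) / 2839 == 13 key bits, matching the decoder, for which 13 bit
 * codes cover deltas 1257 through 4095.
 */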

static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
{
	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
	const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
	int size = COLLISION_BYTES;
	int shift = offset % BITS_PER_BYTE;

	while (--size >= 0)
		*name++ = get_unaligned_le16(addr++) >> shift;
}

static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
{
	u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
	u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
	int size = COLLISION_BYTES;
	int shift = offset % BITS_PER_BYTE;
	u16 mask = ~((u16) 0xFF << shift);
	u16 data;

	while (--size >= 0) {
		data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
		put_unaligned_le16(data, addr++);
	}
}

int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
			      u32 key, const u8 *name,
			      struct delta_index_entry *delta_entry)
{
	int result;

	result = uds_start_delta_index_search(delta_index, list_number, key,
					      delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	do {
		result = uds_next_delta_index_entry(delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	} while (!delta_entry->at_end && (key > delta_entry->key));

	result = uds_remember_delta_index_offset(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	if (!delta_entry->at_end && (key == delta_entry->key)) {
		struct delta_index_entry collision_entry = *delta_entry;

		for (;;) {
			u8 full_name[COLLISION_BYTES];

			result = uds_next_delta_index_entry(&collision_entry);
			if (result != UDS_SUCCESS)
				return result;

			if (collision_entry.at_end || !collision_entry.is_collision)
				break;

			get_collision_name(&collision_entry, full_name);
			if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
				*delta_entry = collision_entry;
				break;
			}
		}
	}

	return UDS_SUCCESS;
}
1480
uds_get_delta_entry_collision(const struct delta_index_entry * delta_entry,u8 * name)1481 int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
1482 {
1483 int result;
1484
1485 result = assert_not_at_end(delta_entry);
1486 if (result != UDS_SUCCESS)
1487 return result;
1488
1489 result = VDO_ASSERT(delta_entry->is_collision,
1490 "Cannot get full block name from a non-collision delta index entry");
1491 if (result != VDO_SUCCESS)
1492 return UDS_BAD_STATE;
1493
1494 get_collision_name(delta_entry, name);
1495 return UDS_SUCCESS;
1496 }
1497
u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
{
	return get_field(delta_entry->delta_zone->memory,
			 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
}

static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
{
	int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
				"delta index is mutable");
	if (result != VDO_SUCCESS)
		result = UDS_BAD_STATE;

	return result;
}

int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
{
	int result;
	u32 value_mask = (1 << delta_entry->value_bits) - 1;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = assert_not_at_end(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	result = VDO_ASSERT((value & value_mask) == value,
			    "Value (%u) being set in a delta index is too large (must fit in %u bits)",
			    value, delta_entry->value_bits);
	if (result != VDO_SUCCESS)
		return UDS_INVALID_ARGUMENT;

	set_field(value, delta_entry->delta_zone->memory,
		  get_delta_entry_offset(delta_entry), delta_entry->value_bits);
	return UDS_SUCCESS;
}

/*
 * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
 * by growing_index, then rebalancing the lists in the new chunk.
 */
static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
			     size_t growing_size)
{
	ktime_t start_time;
	ktime_t end_time;
	struct delta_list *delta_lists;
	u32 i;
	size_t used_space;

	/* Calculate the amount of space that is or will be in use. */
	start_time = current_time_ns(CLOCK_MONOTONIC);
	delta_lists = delta_zone->delta_lists;
	used_space = growing_size;
	for (i = 0; i <= delta_zone->list_count + 1; i++)
		used_space += get_delta_list_byte_size(&delta_lists[i]);

	if (delta_zone->size < used_space)
		return UDS_OVERFLOW;

	/* Compute the new offsets of the delta lists. */
	compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);

	/*
	 * When we rebalance the delta list, we will include the end guard list in the rebalancing.
	 * It contains the end guard data, which must be copied.
	 */
	rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
	end_time = current_time_ns(CLOCK_MONOTONIC);
	delta_zone->rebalance_count++;
	delta_zone->rebalance_time += ktime_sub(end_time, start_time);
	return UDS_SUCCESS;
}

static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
{
	u64 free_before;
	u64 free_after;
	u64 source;
	u64 destination;
	u32 count;
	bool before_flag;
	u8 *memory;
	struct delta_zone *delta_zone = delta_entry->delta_zone;
	struct delta_list *delta_list = delta_entry->delta_list;
	/* Compute bits in use before and after the inserted bits. */
	u32 total_size = delta_list->size;
	u32 before_size = delta_entry->offset;
	u32 after_size = total_size - delta_entry->offset;

	if (total_size + size > U16_MAX) {
		delta_entry->list_overflow = true;
		delta_zone->overflow_count++;
		return UDS_OVERFLOW;
	}

	/* Compute bits available before and after the delta list. */
	free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
	free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
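	/* The guard lists at either end make the [-1] and [+1] neighbors valid. */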

	if ((size <= free_before) && (size <= free_after)) {
		/*
		 * There is enough space on either side of the list. Move the
		 * smaller amount of data; if both sides hold the same amount,
		 * take from the side with more free space.
		 */
		if (before_size < after_size)
			before_flag = true;
		else if (after_size < before_size)
			before_flag = false;
		else
			before_flag = free_before > free_after;
	} else if (size <= free_before) {
		/* There is space before but not after. */
		before_flag = true;
	} else if (size <= free_after) {
		/* There is space after but not before. */
		before_flag = false;
	} else {
		/*
		 * Neither of the surrounding spaces is large enough for this request. Extend
		 * and/or rebalance the delta list memory, choosing to move the least amount of
		 * data.
		 */
		int result;
		u32 growing_index = delta_entry->list_number + 1;

		before_flag = before_size < after_size;
		if (!before_flag)
			growing_index++;
		result = extend_delta_zone(delta_zone, growing_index,
					   BITS_TO_BYTES(size));
		if (result != UDS_SUCCESS)
			return result;
	}

	delta_list->size += size;
	if (before_flag) {
		source = delta_list->start;
		destination = source - size;
		delta_list->start -= size;
		count = before_size;
	} else {
		source = delta_list->start + delta_entry->offset;
		destination = source + size;
		count = after_size;
	}

	memory = delta_zone->memory;
	move_bits(memory, source, memory, destination, count);
	return UDS_SUCCESS;
}

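/*
 * Write the Huffman-coded delta for an entry. A delta below min_keys is
 * stored directly in a min_bits field. A larger delta stores a remainder
 * (t1) in the min_bits field, followed by a unary count of t2 zero bits and
 * a closing one bit. Purely hypothetical illustration: with min_bits = 4,
 * min_keys = 6, incr_keys = 10, a delta of 27 gives temp = 21, t1 = 7,
 * t2 = 2, so the encoding occupies 4 + 2 + 1 = 7 bits, agreeing with the
 * key_bits computed in set_delta().
 */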
static void encode_delta(const struct delta_index_entry *delta_entry)
{
	u32 temp;
	u32 t1;
	u32 t2;
	u64 offset;
	const struct delta_zone *delta_zone = delta_entry->delta_zone;
	u8 *memory = delta_zone->memory;

	offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
	if (delta_entry->delta < delta_zone->min_keys) {
		set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
		return;
	}

	temp = delta_entry->delta - delta_zone->min_keys;
	t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
	t2 = temp / delta_zone->incr_keys;
	set_field(t1, memory, offset, delta_zone->min_bits);
	set_zero(memory, offset + delta_zone->min_bits, t2);
	set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
}

static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
			 const u8 *name)
{
	u8 *memory = delta_entry->delta_zone->memory;
	u64 offset = get_delta_entry_offset(delta_entry);

	set_field(value, memory, offset, delta_entry->value_bits);
	encode_delta(delta_entry);
	if (name != NULL)
		set_collision_name(delta_entry, name);
}

/*
 * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
 * be provided.
 */
int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
			      const u8 *name)
{
	int result;
	struct delta_zone *delta_zone;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	if (delta_entry->is_collision) {
		/*
		 * The caller wants us to insert a collision entry onto a collision entry. This
		 * happens when we find a collision and attempt to add the name again to the index.
		 * This is normally a fatal error unless we are replaying a closed chapter while we
		 * are rebuilding a volume index.
		 */
		return UDS_DUPLICATE_NAME;
	}

	if (delta_entry->offset < delta_entry->delta_list->save_offset) {
		/*
		 * The saved entry offset is after the new entry and will no longer be valid, so
		 * replace it with the insertion point.
		 */
		result = uds_remember_delta_index_offset(delta_entry);
		if (result != UDS_SUCCESS)
			return result;
	}

	if (name != NULL) {
		/* Insert a collision entry which is placed after this entry. */
		result = assert_not_at_end(delta_entry);
		if (result != UDS_SUCCESS)
			return result;

		result = VDO_ASSERT((key == delta_entry->key),
				    "incorrect key for collision entry");
		if (result != VDO_SUCCESS)
			return result;

		delta_entry->offset += delta_entry->entry_bits;
		set_delta(delta_entry, 0);
		delta_entry->is_collision = true;
		delta_entry->entry_bits += COLLISION_BITS;
		result = insert_bits(delta_entry, delta_entry->entry_bits);
	} else if (delta_entry->at_end) {
		/* Insert a new entry at the end of the delta list. */
		result = VDO_ASSERT((key >= delta_entry->key), "key past end of list");
		if (result != VDO_SUCCESS)
			return result;

		set_delta(delta_entry, key - delta_entry->key);
		delta_entry->key = key;
		delta_entry->at_end = false;
		result = insert_bits(delta_entry, delta_entry->entry_bits);
	} else {
		u16 old_entry_size;
		u16 additional_size;
		struct delta_index_entry next_entry;
		u32 next_value;

		/*
		 * Insert a new entry which requires the delta in the following entry to be
		 * updated.
		 */
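		/*
		 * The existing delta splits in two: if the previous key is P
		 * and the following entry's key is K (delta K - P), inserting
		 * key X yields deltas X - P and K - X, which still sum to
		 * K - P, leaving all later keys unchanged.
		 */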
		result = VDO_ASSERT((key < delta_entry->key),
				    "key precedes following entry");
		if (result != VDO_SUCCESS)
			return result;

		result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta),
				    "key affects following entry's delta");
		if (result != VDO_SUCCESS)
			return result;

		old_entry_size = delta_entry->entry_bits;
		next_entry = *delta_entry;
		next_value = uds_get_delta_entry_value(&next_entry);
		set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
		delta_entry->key = key;
		set_delta(&next_entry, next_entry.key - key);
		next_entry.offset += delta_entry->entry_bits;
		/* The two new entries are always bigger than the single entry being replaced. */
		additional_size = (delta_entry->entry_bits +
				   next_entry.entry_bits - old_entry_size);
		result = insert_bits(delta_entry, additional_size);
		if (result != UDS_SUCCESS)
			return result;

		encode_entry(&next_entry, next_value, NULL);
	}

	if (result != UDS_SUCCESS)
		return result;

	encode_entry(delta_entry, value, name);
	delta_zone = delta_entry->delta_zone;
	delta_zone->record_count++;
	delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
	return UDS_SUCCESS;
}

static void delete_bits(const struct delta_index_entry *delta_entry, int size)
{
	u64 source;
	u64 destination;
	u32 count;
	bool before_flag;
	struct delta_list *delta_list = delta_entry->delta_list;
	u8 *memory = delta_entry->delta_zone->memory;
	/* Compute bits retained before and after the deleted bits. */
	u32 total_size = delta_list->size;
	u32 before_size = delta_entry->offset;
	u32 after_size = total_size - delta_entry->offset - size;

	/*
	 * Determine whether to add to the available space either before or after the delta list.
	 * We prefer to move the least amount of data. If it is exactly the same, try to add to the
	 * smaller amount of free space.
	 */
	if (before_size < after_size) {
		before_flag = true;
	} else if (after_size < before_size) {
		before_flag = false;
	} else {
		u64 free_before =
			(delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
		u64 free_after =
			(delta_list[1].start - (delta_list[0].start + delta_list[0].size));

		before_flag = (free_before < free_after);
	}

	delta_list->size -= size;
	if (before_flag) {
		source = delta_list->start;
		destination = source + size;
		delta_list->start += size;
		count = before_size;
	} else {
		destination = delta_list->start + delta_entry->offset;
		source = destination + size;
		count = after_size;
	}

	move_bits(memory, source, memory, destination, count);
}

int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
{
	int result;
	struct delta_index_entry next_entry;
	struct delta_zone *delta_zone;
	struct delta_list *delta_list;

	result = assert_mutable_entry(delta_entry);
	if (result != UDS_SUCCESS)
		return result;

	next_entry = *delta_entry;
	result = uds_next_delta_index_entry(&next_entry);
	if (result != UDS_SUCCESS)
		return result;

	delta_zone = delta_entry->delta_zone;

	if (delta_entry->is_collision) {
		/* This is a collision entry, so just remove it. */
		delete_bits(delta_entry, delta_entry->entry_bits);
		next_entry.offset = delta_entry->offset;
		delta_zone->collision_count -= 1;
	} else if (next_entry.at_end) {
		/* This entry is at the end of the list, so just remove it. */
		delete_bits(delta_entry, delta_entry->entry_bits);
		next_entry.key -= delta_entry->delta;
		next_entry.offset = delta_entry->offset;
	} else {
		/* The delta in the next entry needs to be updated. */
		u32 next_value = uds_get_delta_entry_value(&next_entry);
		u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;

		if (next_entry.is_collision) {
			next_entry.is_collision = false;
			delta_zone->collision_count -= 1;
		}

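		/*
		 * Fold the removed delta into the successor so that all later
		 * keys are unchanged: the successor's new delta is the sum of
		 * the two old deltas.
		 */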
		set_delta(&next_entry, delta_entry->delta + next_entry.delta);
		next_entry.offset = delta_entry->offset;
		/* The one new entry is always smaller than the two entries being replaced. */
		delete_bits(delta_entry, old_size - next_entry.entry_bits);
		encode_entry(&next_entry, next_value, NULL);
	}

	delta_zone->record_count--;
	delta_zone->discard_count++;
	*delta_entry = next_entry;

	delta_list = delta_entry->delta_list;
	if (delta_entry->offset < delta_list->save_offset) {
		/* The saved entry offset is no longer valid. */
		delta_list->save_key = 0;
		delta_list->save_offset = 0;
	}

	return UDS_SUCCESS;
}

void uds_get_delta_index_stats(const struct delta_index *delta_index,
			       struct delta_index_stats *stats)
{
	unsigned int z;
	const struct delta_zone *delta_zone;

	memset(stats, 0, sizeof(struct delta_index_stats));
	for (z = 0; z < delta_index->zone_count; z++) {
		delta_zone = &delta_index->delta_zones[z];
		stats->rebalance_time += delta_zone->rebalance_time;
		stats->rebalance_count += delta_zone->rebalance_count;
		stats->record_count += delta_zone->record_count;
		stats->collision_count += delta_zone->collision_count;
		stats->discard_count += delta_zone->discard_count;
		stats->overflow_count += delta_zone->overflow_count;
		stats->list_count += delta_zone->list_count;
	}
}

size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
{
	u16 min_bits;
	u32 incr_keys;
	u32 min_keys;

	compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
	/* On average, each delta is encoded into about min_bits + 1.5 bits. */
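	/*
	 * Note that the result is a size in bits, as consumed by
	 * uds_get_delta_index_page_count(). Hypothetical example: 1000
	 * entries with payload_bits = 8 and min_bits = 10 come to
	 * 1000 * (8 + 10 + 1) + 500 = 19,500 bits.
	 */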
	return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
}

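/*
 * Estimate how many immutable index pages are needed to hold the expected
 * number of entries spread across the given number of delta lists.
 */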
u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
				   u32 payload_bits, size_t bytes_per_page)
{
	unsigned int bits_per_delta_list;
	unsigned int bits_per_page;
	size_t bits_per_index;

	/* Compute the expected number of bits needed for all the entries. */
	bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
						      payload_bits);
	bits_per_delta_list = bits_per_index / list_count;

	/* Add in the immutable delta list headers. */
	bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
	/* Compute the number of usable bits on an immutable index page. */
	bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
	/*
	 * Reduce the bits per page by one immutable delta list header and one delta list to
	 * account for internal fragmentation.
	 */
	bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
	/* Now compute the number of pages needed. */
	return DIV_ROUND_UP(bits_per_index, bits_per_page);
}

void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
{
	vdo_log_ratelimit(vdo_log_info,
			  "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
			  delta_entry->list_number, delta_entry->key,
			  delta_entry->offset, delta_entry->at_end ? " end" : "",
			  delta_entry->is_collision ? " collision" : "",
			  delta_entry->delta_list->size,
			  delta_entry->list_overflow ? " overflow" : "");
	delta_entry->list_overflow = false;
}
