1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4 *
5 * Code for managing the extent btree and dynamically updating the writeback
6 * dirty sector count.
7 */
8
9 #include "bcachefs.h"
10 #include "bkey_methods.h"
11 #include "btree_cache.h"
12 #include "btree_gc.h"
13 #include "btree_io.h"
14 #include "btree_iter.h"
15 #include "buckets.h"
16 #include "checksum.h"
17 #include "compress.h"
18 #include "debug.h"
19 #include "disk_groups.h"
20 #include "error.h"
21 #include "extents.h"
22 #include "inode.h"
23 #include "journal.h"
24 #include "rebalance.h"
25 #include "replicas.h"
26 #include "super.h"
27 #include "super-io.h"
28 #include "trace.h"
29 #include "util.h"
30
31 static const char * const bch2_extent_flags_strs[] = {
32 #define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
33 BCH_EXTENT_FLAGS()
34 #undef x
35 NULL,
36 };
37
38 static unsigned bch2_crc_field_size_max[] = {
39 [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
40 [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
41 [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
42 };
43
44 static void bch2_extent_crc_pack(union bch_extent_crc *,
45 struct bch_extent_crc_unpacked,
46 enum bch_extent_entry_type);
47
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
49 unsigned dev)
50 {
51 struct bch_dev_io_failures *i;
52
53 for (i = f->devs; i < f->devs + f->nr; i++)
54 if (i->dev == dev)
55 return i;
56
57 return NULL;
58 }
59
void bch2_mark_io_failure(struct bch_io_failures *failed,
61 struct extent_ptr_decoded *p,
62 bool csum_error)
63 {
64 struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
65
66 if (!f) {
67 BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
68
69 f = &failed->devs[failed->nr++];
70 memset(f, 0, sizeof(*f));
71 f->dev = p->ptr.dev;
72 }
73
74 if (p->do_ec_reconstruct)
75 f->failed_ec = true;
76 else if (!csum_error)
77 f->failed_io = true;
78 else
79 f->failed_csum_nr++;
80 }
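
/*
 * Example (illustrative): on the read retry path, a checksum error on one
 * device bumps that device's failed_csum_nr, so the next call to
 * bch2_bkey_pick_read_device() deprioritizes that replica and skips it
 * entirely once failed_csum_nr exceeds the checksum_err_retry_nr option.
 */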
81
static inline u64 dev_latency(struct bch_dev *ca)
83 {
84 return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
85 }
86
static inline int dev_failed(struct bch_dev *ca)
88 {
89 return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
90 }
91
92 /*
93 * returns true if p1 is better than p2:
94 */
static inline bool ptr_better(struct bch_fs *c,
96 const struct extent_ptr_decoded p1,
97 u64 p1_latency,
98 struct bch_dev *ca1,
99 const struct extent_ptr_decoded p2,
100 u64 p2_latency)
101 {
102 struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
103
104 int failed_delta = dev_failed(ca1) - dev_failed(ca2);
105 if (unlikely(failed_delta))
106 return failed_delta < 0;
107
108 if (unlikely(bch2_force_reconstruct_read))
109 return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
110
111 if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
112 return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
113
114 int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
115 if (unlikely(crc_retry_delta))
116 return crc_retry_delta < 0;
117
118 /* Pick at random, biased in favor of the faster device: */
119
120 return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
121 }
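
/*
 * Worked example of the weighting above (illustrative): the caller passes in
 * latencies that have already been squared, so a device with 3x the read
 * latency of its peer gets weight 9 vs 1 and loses the coin flip ~90% of the
 * time - but still sees ~10% of reads, keeping its latency numbers current.
 */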
122
/*
 * Pick a non-stale pointer to read from. If @dev is nonnegative, only
 * pointers on that device are considered; otherwise any device may be used.
 *
 * Returns 1 if a pointer was picked, 0 if the extent should be treated as a
 * hole (unwritten, or no dirty pointers), or a negative error code.
 */
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
129 struct bch_io_failures *failed,
130 struct extent_ptr_decoded *pick,
131 int dev)
132 {
133 bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
134 bool have_dirty_ptrs = false, have_pick = false;
135
136 if (k.k->type == KEY_TYPE_error)
137 return -BCH_ERR_key_type_error;
138
139 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
140
141 if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
142 return -BCH_ERR_extent_poisoned;
143
144 rcu_read_lock();
145 const union bch_extent_entry *entry;
146 struct extent_ptr_decoded p;
147 u64 pick_latency;
148
149 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
150 have_dirty_ptrs |= !p.ptr.cached;
151
152 /*
153 * Unwritten extent: no need to actually read, treat it as a
154 * hole and return 0s:
155 */
156 if (p.ptr.unwritten) {
157 rcu_read_unlock();
158 return 0;
159 }
160
161 /* Are we being asked to read from a specific device? */
162 if (dev >= 0 && p.ptr.dev != dev)
163 continue;
164
165 struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
166
167 if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
168 continue;
169
170 struct bch_dev_io_failures *f =
171 unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
172 if (unlikely(f)) {
173 p.crc_retry_nr = f->failed_csum_nr;
174 p.has_ec &= ~f->failed_ec;
175
176 if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
177 have_io_errors |= f->failed_io;
178 have_io_errors |= f->failed_ec;
179 }
180 have_csum_errors |= !!f->failed_csum_nr;
181
182 if (p.has_ec && (f->failed_io || f->failed_csum_nr))
183 p.do_ec_reconstruct = true;
184 else if (f->failed_io ||
185 f->failed_csum_nr > c->opts.checksum_err_retry_nr)
186 continue;
187 }
188
189 have_missing_devs |= ca && !bch2_dev_is_online(ca);
190
191 if (!ca || !bch2_dev_is_online(ca)) {
192 if (!p.has_ec)
193 continue;
194 p.do_ec_reconstruct = true;
195 }
196
197 if (bch2_force_reconstruct_read && p.has_ec)
198 p.do_ec_reconstruct = true;
199
200 u64 p_latency = dev_latency(ca);
201 /*
202 * Square the latencies, to bias more in favor of the faster
203 * device - we never want to stop issuing reads to the slower
204 * device altogether, so that we can update our latency numbers:
205 */
206 p_latency *= p_latency;
207
208 if (!have_pick ||
209 ptr_better(c,
210 p, p_latency, ca,
211 *pick, pick_latency)) {
212 *pick = p;
213 pick_latency = p_latency;
214 have_pick = true;
215 }
216 }
217 rcu_read_unlock();
218
219 if (have_pick)
220 return 1;
221 if (!have_dirty_ptrs)
222 return 0;
223 if (have_missing_devs)
224 return -BCH_ERR_no_device_to_read_from;
225 if (have_csum_errors)
226 return -BCH_ERR_data_read_csum_err;
227 if (have_io_errors)
228 return -BCH_ERR_data_read_io_err;
229
230 /*
231 * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
232 * but they don't point to valid devices:
233 */
234 return -BCH_ERR_no_devices_valid;
235 }
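
/*
 * Typical caller pattern (minimal sketch, not taken from this file): treat a
 * return of 1 as "pick is valid", 0 as "read as a hole", and a negative
 * return as the error to pass up:
 *
 *	struct extent_ptr_decoded pick;
 *	int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, -1);
 *	if (ret < 0)
 *		return ret;		// no usable replica
 *	if (!ret)
 *		zero_fill_bio(bio);	// unwritten/hole: no IO needed
 *	else
 *		submit_read(&pick);	// hypothetical helper: issue the read
 *					// to pick.ptr.dev at pick.ptr.offset
 */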
236
237 /* KEY_TYPE_btree_ptr: */
238
int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
240 struct bkey_validate_context from)
241 {
242 int ret = 0;
243
244 bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
245 c, btree_ptr_val_too_big,
246 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
247
248 ret = bch2_bkey_ptrs_validate(c, k, from);
249 fsck_err:
250 return ret;
251 }
252
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
254 struct bkey_s_c k)
255 {
256 bch2_bkey_ptrs_to_text(out, c, k);
257 }
258
int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
260 struct bkey_validate_context from)
261 {
262 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
263 int ret = 0;
264
265 bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
266 c, btree_ptr_v2_val_too_big,
267 "value too big (%zu > %zu)",
268 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
269
270 bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
271 c, btree_ptr_v2_min_key_bad,
272 "min_key > key");
273
274 if ((from.flags & BCH_VALIDATE_write) &&
275 c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
276 bkey_fsck_err_on(!bp.v->sectors_written,
277 c, btree_ptr_v2_written_0,
278 "sectors_written == 0");
279
280 ret = bch2_bkey_ptrs_validate(c, k, from);
281 fsck_err:
282 return ret;
283 }
284
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
286 struct bkey_s_c k)
287 {
288 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
289
290 prt_printf(out, "seq %llx written %u min_key %s",
291 le64_to_cpu(bp.v->seq),
292 le16_to_cpu(bp.v->sectors_written),
293 BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
294
295 bch2_bpos_to_text(out, bp.v->min_key);
296 prt_printf(out, " ");
297 bch2_bkey_ptrs_to_text(out, c, k);
298 }
299
void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
301 unsigned big_endian, int write,
302 struct bkey_s k)
303 {
304 struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
305
306 compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
307
308 if (version < bcachefs_metadata_version_inode_btree_change &&
309 btree_id_is_extents(btree_id) &&
310 !bkey_eq(bp.v->min_key, POS_MIN))
311 bp.v->min_key = write
312 ? bpos_nosnap_predecessor(bp.v->min_key)
313 : bpos_nosnap_successor(bp.v->min_key);
314 }
315
316 /* KEY_TYPE_extent: */
317
bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
319 {
320 struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l);
321 struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
322 union bch_extent_entry *en_l;
323 const union bch_extent_entry *en_r;
324 struct extent_ptr_decoded lp, rp;
325 bool use_right_ptr;
326
327 en_l = l_ptrs.start;
328 en_r = r_ptrs.start;
329 while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
330 if (extent_entry_type(en_l) != extent_entry_type(en_r))
331 return false;
332
333 en_l = extent_entry_next(en_l);
334 en_r = extent_entry_next(en_r);
335 }
336
337 if (en_l < l_ptrs.end || en_r < r_ptrs.end)
338 return false;
339
340 en_l = l_ptrs.start;
341 en_r = r_ptrs.start;
342 lp.crc = bch2_extent_crc_unpack(l.k, NULL);
343 rp.crc = bch2_extent_crc_unpack(r.k, NULL);
344
345 while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
346 __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
347 if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
348 rp.ptr.offset + rp.crc.offset ||
349 lp.ptr.dev != rp.ptr.dev ||
350 lp.ptr.gen != rp.ptr.gen ||
351 lp.ptr.unwritten != rp.ptr.unwritten ||
352 lp.has_ec != rp.has_ec)
353 return false;
354
355 /* Extents may not straddle buckets: */
356 rcu_read_lock();
357 struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
358 bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
359 rcu_read_unlock();
360
361 if (!same_bucket)
362 return false;
363
364 if (lp.has_ec != rp.has_ec ||
365 (lp.has_ec &&
366 (lp.ec.block != rp.ec.block ||
367 lp.ec.redundancy != rp.ec.redundancy ||
368 lp.ec.idx != rp.ec.idx)))
369 return false;
370
371 if (lp.crc.compression_type != rp.crc.compression_type ||
372 lp.crc.nonce != rp.crc.nonce)
373 return false;
374
375 if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
376 lp.crc.uncompressed_size) {
377 /* can use left extent's crc entry */
378 } else if (lp.crc.live_size <= rp.crc.offset) {
379 /* can use right extent's crc entry */
380 } else {
381 /* check if checksums can be merged: */
382 if (lp.crc.csum_type != rp.crc.csum_type ||
383 lp.crc.nonce != rp.crc.nonce ||
384 crc_is_compressed(lp.crc) ||
385 !bch2_checksum_mergeable(lp.crc.csum_type))
386 return false;
387
388 if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
389 rp.crc.offset)
390 return false;
391
392 if (lp.crc.csum_type &&
393 lp.crc.uncompressed_size +
394 rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
395 return false;
396 }
397
398 en_l = extent_entry_next(en_l);
399 en_r = extent_entry_next(en_r);
400 }
401
402 en_l = l_ptrs.start;
403 en_r = r_ptrs.start;
404 while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
405 if (extent_entry_is_crc(en_l)) {
406 struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
407 struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
408
409 if (crc_l.uncompressed_size + crc_r.uncompressed_size >
410 bch2_crc_field_size_max[extent_entry_type(en_l)])
411 return false;
412 }
413
414 en_l = extent_entry_next(en_l);
415 en_r = extent_entry_next(en_r);
416 }
417
418 use_right_ptr = false;
419 en_l = l_ptrs.start;
420 en_r = r_ptrs.start;
421 while (en_l < l_ptrs.end) {
422 if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
423 use_right_ptr)
424 en_l->ptr = en_r->ptr;
425
426 if (extent_entry_is_crc(en_l)) {
427 struct bch_extent_crc_unpacked crc_l =
428 bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
429 struct bch_extent_crc_unpacked crc_r =
430 bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
431
432 use_right_ptr = false;
433
434 if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
435 crc_l.uncompressed_size) {
436 /* can use left extent's crc entry */
437 } else if (crc_l.live_size <= crc_r.offset) {
438 /* can use right extent's crc entry */
439 crc_r.offset -= crc_l.live_size;
440 bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
441 extent_entry_type(en_l));
442 use_right_ptr = true;
443 } else {
444 crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
445 crc_l.csum,
446 crc_r.csum,
447 crc_r.uncompressed_size << 9);
448
449 crc_l.uncompressed_size += crc_r.uncompressed_size;
450 crc_l.compressed_size += crc_r.compressed_size;
451 bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
452 extent_entry_type(en_l));
453 }
454 }
455
456 en_l = extent_entry_next(en_l);
457 en_r = extent_entry_next(en_r);
458 }
459
460 bch2_key_resize(l.k, l.k->size + r.k->size);
461 return true;
462 }
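
/*
 * Example (illustrative): two adjacent 8 sector extents written back to back
 * into the same bucket, both crc32c checksummed and uncompressed, merge into
 * a single 16 sector extent whose checksum is combined with
 * bch2_checksum_merge(); extents in different buckets, with mismatched
 * compression or nonces, or whose combined size would overflow the crc
 * entry's size field do not merge.
 */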
463
464 /* KEY_TYPE_reservation: */
465
int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
467 struct bkey_validate_context from)
468 {
469 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
470 int ret = 0;
471
472 bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
473 c, reservation_key_nr_replicas_invalid,
474 "invalid nr_replicas (%u)", r.v->nr_replicas);
475 fsck_err:
476 return ret;
477 }
478
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
480 struct bkey_s_c k)
481 {
482 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
483
484 prt_printf(out, "generation %u replicas %u",
485 le32_to_cpu(r.v->generation),
486 r.v->nr_replicas);
487 }
488
bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
490 {
491 struct bkey_s_reservation l = bkey_s_to_reservation(_l);
492 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);
493
494 if (l.v->generation != r.v->generation ||
495 l.v->nr_replicas != r.v->nr_replicas)
496 return false;
497
498 bch2_key_resize(l.k, l.k->size + r.k->size);
499 return true;
500 }
501
502 /* Extent checksum entries: */
503
504 /* returns true if not equal */
static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
506 struct bch_extent_crc_unpacked r)
507 {
508 return (l.csum_type != r.csum_type ||
509 l.compression_type != r.compression_type ||
510 l.compressed_size != r.compressed_size ||
511 l.uncompressed_size != r.uncompressed_size ||
512 l.offset != r.offset ||
513 l.live_size != r.live_size ||
514 l.nonce != r.nonce ||
515 bch2_crc_cmp(l.csum, r.csum));
516 }
517
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
519 struct bch_extent_crc_unpacked n)
520 {
521 return !crc_is_compressed(u) &&
522 u.csum_type &&
523 u.uncompressed_size > u.live_size &&
524 bch2_csum_type_is_encryption(u.csum_type) ==
525 bch2_csum_type_is_encryption(n.csum_type);
526 }
527
bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
529 struct bch_extent_crc_unpacked n)
530 {
531 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
532 struct bch_extent_crc_unpacked crc;
533 const union bch_extent_entry *i;
534
535 if (!n.csum_type)
536 return false;
537
538 bkey_for_each_crc(k.k, ptrs, crc, i)
539 if (can_narrow_crc(crc, n))
540 return true;
541
542 return false;
543 }
544
545 /*
546 * We're writing another replica for this extent, so while we've got the data in
547 * memory we'll be computing a new checksum for the currently live data.
548 *
549 * If there are other replicas we aren't moving, and they are checksummed but
550 * not compressed, we can modify them to point to only the data that is
551 * currently live (so that readers won't have to bounce) while we've got the
552 * checksum we need:
553 */
bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
555 {
556 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
557 struct bch_extent_crc_unpacked u;
558 struct extent_ptr_decoded p;
559 union bch_extent_entry *i;
560 bool ret = false;
561
562 /* Find a checksum entry that covers only live data: */
563 if (!n.csum_type) {
564 bkey_for_each_crc(&k->k, ptrs, u, i)
565 if (!crc_is_compressed(u) &&
566 u.csum_type &&
567 u.live_size == u.uncompressed_size) {
568 n = u;
569 goto found;
570 }
571 return false;
572 }
573 found:
574 BUG_ON(crc_is_compressed(n));
575 BUG_ON(n.offset);
576 BUG_ON(n.live_size != k->k.size);
577
578 restart_narrow_pointers:
579 ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
580
581 bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
582 if (can_narrow_crc(p.crc, n)) {
583 bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
584 p.ptr.offset += p.crc.offset;
585 p.crc = n;
586 bch2_extent_ptr_decoded_append(k, &p);
587 ret = true;
588 goto restart_narrow_pointers;
589 }
590
591 return ret;
592 }
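
/*
 * Example of what narrowing buys us (illustrative): a 256 sector extent of
 * which only 64 sectors are still live, with an uncompressed checksummed
 * replica, gets that replica's crc entry rewritten to cover just the live 64
 * sectors - the old crc offset is folded into ptr.offset and live_size ==
 * uncompressed_size - so readers no longer have to read, checksum and bounce
 * the full 256 sectors.
 */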
593
static void bch2_extent_crc_pack(union bch_extent_crc *dst,
595 struct bch_extent_crc_unpacked src,
596 enum bch_extent_entry_type type)
597 {
598 #define common_fields(_src) \
599 .type = BIT(type), \
600 .csum_type = _src.csum_type, \
601 .compression_type = _src.compression_type, \
602 ._compressed_size = _src.compressed_size - 1, \
603 ._uncompressed_size = _src.uncompressed_size - 1, \
604 .offset = _src.offset
605
606 switch (type) {
607 case BCH_EXTENT_ENTRY_crc32:
608 dst->crc32 = (struct bch_extent_crc32) {
609 common_fields(src),
610 .csum = (u32 __force) *((__le32 *) &src.csum.lo),
611 };
612 break;
613 case BCH_EXTENT_ENTRY_crc64:
614 dst->crc64 = (struct bch_extent_crc64) {
615 common_fields(src),
616 .nonce = src.nonce,
617 .csum_lo = (u64 __force) src.csum.lo,
618 .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
619 };
620 break;
621 case BCH_EXTENT_ENTRY_crc128:
622 dst->crc128 = (struct bch_extent_crc128) {
623 common_fields(src),
624 .nonce = src.nonce,
625 .csum = src.csum,
626 };
627 break;
628 default:
629 BUG();
630 }
#undef common_fields
632 }
633
void bch2_extent_crc_append(struct bkey_i *k,
635 struct bch_extent_crc_unpacked new)
636 {
637 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
638 union bch_extent_crc *crc = (void *) ptrs.end;
639 enum bch_extent_entry_type type;
640
641 if (bch_crc_bytes[new.csum_type] <= 4 &&
642 new.uncompressed_size <= CRC32_SIZE_MAX &&
643 new.nonce <= CRC32_NONCE_MAX)
644 type = BCH_EXTENT_ENTRY_crc32;
645 else if (bch_crc_bytes[new.csum_type] <= 10 &&
646 new.uncompressed_size <= CRC64_SIZE_MAX &&
647 new.nonce <= CRC64_NONCE_MAX)
648 type = BCH_EXTENT_ENTRY_crc64;
649 else if (bch_crc_bytes[new.csum_type] <= 16 &&
650 new.uncompressed_size <= CRC128_SIZE_MAX &&
651 new.nonce <= CRC128_NONCE_MAX)
652 type = BCH_EXTENT_ENTRY_crc128;
653 else
654 BUG();
655
656 bch2_extent_crc_pack(crc, new, type);
657
658 k->k.u64s += extent_entry_u64s(ptrs.end);
659
660 EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
661 }
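
/*
 * Example of the entry format selection above (illustrative): a crc32c
 * checksum (4 byte csum, no nonce) on a small extent packs into the compact
 * bch_extent_crc32 entry, while chacha20/poly1305 with its 16 byte MAC and a
 * nonce forces the crc128 format.
 */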
662
663 /* Generic code for keys with pointers: */
664
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
666 {
667 return bch2_bkey_devs(k).nr;
668 }
669
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
671 {
672 return k.k->type == KEY_TYPE_reservation
673 ? bkey_s_c_to_reservation(k).v->nr_replicas
674 : bch2_bkey_dirty_devs(k).nr;
675 }
676
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
678 {
679 unsigned ret = 0;
680
681 if (k.k->type == KEY_TYPE_reservation) {
682 ret = bkey_s_c_to_reservation(k).v->nr_replicas;
683 } else {
684 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
685 const union bch_extent_entry *entry;
686 struct extent_ptr_decoded p;
687
688 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
689 ret += !p.ptr.cached && !crc_is_compressed(p.crc);
690 }
691
692 return ret;
693 }
694
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
696 {
697 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
698 const union bch_extent_entry *entry;
699 struct extent_ptr_decoded p;
700 unsigned ret = 0;
701
702 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
703 if (!p.ptr.cached && crc_is_compressed(p.crc))
704 ret += p.crc.compressed_size;
705
706 return ret;
707 }
708
bool bch2_bkey_is_incompressible(struct bkey_s_c k)
710 {
711 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
712 const union bch_extent_entry *entry;
713 struct bch_extent_crc_unpacked crc;
714
715 bkey_for_each_crc(k.k, ptrs, crc, entry)
716 if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
717 return true;
718 return false;
719 }
720
unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
722 {
723 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
724 const union bch_extent_entry *entry;
725 struct extent_ptr_decoded p = { 0 };
726 unsigned replicas = 0;
727
728 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
729 if (p.ptr.cached)
730 continue;
731
732 if (p.has_ec)
733 replicas += p.ec.redundancy;
734
735 replicas++;
736
737 }
738
739 return replicas;
740 }
741
static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
743 {
744 if (p->ptr.cached)
745 return 0;
746
747 return p->has_ec
748 ? p->ec.redundancy + 1
749 : ca->mi.durability;
750 }
751
unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
753 {
754 struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
755
756 return ca ? __extent_ptr_durability(ca, p) : 0;
757 }
758
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
760 {
761 struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);
762
763 if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
764 return 0;
765
766 return __extent_ptr_durability(ca, p);
767 }
768
unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
770 {
771 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
772 const union bch_extent_entry *entry;
773 struct extent_ptr_decoded p;
774 unsigned durability = 0;
775
776 rcu_read_lock();
777 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
778 durability += bch2_extent_ptr_durability(c, &p);
779 rcu_read_unlock();
780
781 return durability;
782 }
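
/*
 * Worked example (illustrative): an extent with one plain pointer on a
 * durability=1 device plus one erasure coded pointer in a stripe with
 * redundancy=2 reports 1 + (2 + 1) = 4; cached pointers and pointers to
 * failed or missing devices contribute 0.
 */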
783
static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
785 {
786 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
787 const union bch_extent_entry *entry;
788 struct extent_ptr_decoded p;
789 unsigned durability = 0;
790
791 rcu_read_lock();
792 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
793 if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
794 durability += bch2_extent_ptr_durability(c, &p);
795 rcu_read_unlock();
796
797 return durability;
798 }
799
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
801 {
802 union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
803 union bch_extent_entry *next = extent_entry_next(entry);
804
805 memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
806 k->k.u64s -= extent_entry_u64s(entry);
807 }
808
void bch2_extent_ptr_decoded_append(struct bkey_i *k,
810 struct extent_ptr_decoded *p)
811 {
812 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
813 struct bch_extent_crc_unpacked crc =
814 bch2_extent_crc_unpack(&k->k, NULL);
815 union bch_extent_entry *pos;
816
817 if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
818 pos = ptrs.start;
819 goto found;
820 }
821
822 bkey_for_each_crc(&k->k, ptrs, crc, pos)
823 if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
824 pos = extent_entry_next(pos);
825 goto found;
826 }
827
828 bch2_extent_crc_append(k, p->crc);
829 pos = bkey_val_end(bkey_i_to_s(k));
830 found:
831 p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
832 __extent_entry_insert(k, pos, to_entry(&p->ptr));
833
834 if (p->has_ec) {
835 p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
836 __extent_entry_insert(k, pos, to_entry(&p->ec));
837 }
838 }
839
static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
841 union bch_extent_entry *entry)
842 {
843 union bch_extent_entry *i = ptrs.start;
844
845 if (i == entry)
846 return NULL;
847
848 while (extent_entry_next(i) != entry)
849 i = extent_entry_next(i);
850 return i;
851 }
852
/*
 * Drop @ptr from @k, along with any crc or stripe_ptr entries that applied
 * only to that pointer:
 */
void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
857 {
858 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
859 union bch_extent_entry *entry = to_entry(ptr), *next;
860 bool drop_crc = true;
861
862 if (k.k->type == KEY_TYPE_stripe) {
863 ptr->dev = BCH_SB_MEMBER_INVALID;
864 return;
865 }
866
867 EBUG_ON(ptr < &ptrs.start->ptr ||
868 ptr >= &ptrs.end->ptr);
869 EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
870
871 for (next = extent_entry_next(entry);
872 next != ptrs.end;
873 next = extent_entry_next(next)) {
874 if (extent_entry_is_crc(next)) {
875 break;
876 } else if (extent_entry_is_ptr(next)) {
877 drop_crc = false;
878 break;
879 }
880 }
881
882 extent_entry_drop(k, entry);
883
884 while ((entry = extent_entry_prev(ptrs, entry))) {
885 if (extent_entry_is_ptr(entry))
886 break;
887
888 if ((extent_entry_is_crc(entry) && drop_crc) ||
889 extent_entry_is_stripe_ptr(entry))
890 extent_entry_drop(k, entry);
891 }
892 }
893
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
895 {
896 if (k.k->type != KEY_TYPE_stripe) {
897 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
898 const union bch_extent_entry *entry;
899 struct extent_ptr_decoded p;
900
901 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
902 if (p.ptr.dev == ptr->dev && p.has_ec) {
903 ptr->dev = BCH_SB_MEMBER_INVALID;
904 return;
905 }
906 }
907
908 bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
909
910 bch2_bkey_drop_ptr_noerror(k, ptr);
911
912 /*
913 * If we deleted all the dirty pointers and there's still cached
914 * pointers, we could set the cached pointers to dirty if they're not
915 * stale - but to do that correctly we'd need to grab an open_bucket
916 * reference so that we don't race with bucket reuse:
917 */
918 if (have_dirty &&
919 !bch2_bkey_dirty_devs(k.s_c).nr) {
920 k.k->type = KEY_TYPE_error;
921 set_bkey_val_u64s(k.k, 0);
922 } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
923 k.k->type = KEY_TYPE_deleted;
924 set_bkey_val_u64s(k.k, 0);
925 }
926 }
927
void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
929 {
930 bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
931 }
932
void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
934 {
935 bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
936 }
937
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
939 {
940 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
941
942 bkey_for_each_ptr(ptrs, ptr)
943 if (ptr->dev == dev)
944 return ptr;
945
946 return NULL;
947 }
948
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
950 {
951 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
952 struct bch_dev *ca;
953 bool ret = false;
954
955 rcu_read_lock();
956 bkey_for_each_ptr(ptrs, ptr)
957 if (bch2_dev_in_target(c, ptr->dev, target) &&
958 (ca = bch2_dev_rcu(c, ptr->dev)) &&
959 (!ptr->cached ||
960 !dev_ptr_stale_rcu(ca, ptr))) {
961 ret = true;
962 break;
963 }
964 rcu_read_unlock();
965
966 return ret;
967 }
968
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
970 struct bch_extent_ptr m, u64 offset)
971 {
972 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
973 const union bch_extent_entry *entry;
974 struct extent_ptr_decoded p;
975
976 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
977 if (p.ptr.dev == m.dev &&
978 p.ptr.gen == m.gen &&
979 (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
980 (s64) m.offset - offset)
981 return true;
982
983 return false;
984 }
985
986 /*
987 * Returns true if two extents refer to the same data:
988 */
bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
990 {
991 if (k1.k->type != k2.k->type)
992 return false;
993
994 if (bkey_extent_is_direct_data(k1.k)) {
995 struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
996 struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
997 const union bch_extent_entry *entry1, *entry2;
998 struct extent_ptr_decoded p1, p2;
999
1000 if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
1001 return false;
1002
1003 bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
1004 bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
1005 if (p1.ptr.dev == p2.ptr.dev &&
1006 p1.ptr.gen == p2.ptr.gen &&
1007
1008 /*
1009 * This checks that the two pointers point
1010 * to the same region on disk - adjusting
1011 * for the difference in where the extents
1012 * start, since one may have been trimmed:
1013 */
1014 (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
1015 (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
1016
1017 /*
1018 * This additionally checks that the
1019 * extents overlap on disk, since the
1020 * previous check may trigger spuriously
1021 * when one extent is immediately partially
1022 * overwritten with another extent (so that
1023 * on disk they are adjacent) and
1024 * compression is in use:
1025 */
1026 ((p1.ptr.offset >= p2.ptr.offset &&
1027 p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
1028 (p2.ptr.offset >= p1.ptr.offset &&
1029 p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
1030 return true;
1031
1032 return false;
1033 } else {
1034 /* KEY_TYPE_deleted, etc. */
1035 return true;
1036 }
1037 }
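
/*
 * Example of why the overlap check matters (illustrative): if a compressed
 * extent is partially overwritten and the replacement happens to be allocated
 * immediately after it on the same device, the two keys can have matching
 * dev/gen and matching "ptr.offset - start offset" values while referring to
 * different data; requiring the on-disk ranges to overlap rejects that case.
 */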
1038
1039 struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
1041 {
1042 struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
1043 union bch_extent_entry *entry2;
1044 struct extent_ptr_decoded p2;
1045
1046 bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
1047 if (p1.ptr.dev == p2.ptr.dev &&
1048 p1.ptr.gen == p2.ptr.gen &&
1049 (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
1050 (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
1051 return &entry2->ptr;
1052
1053 return NULL;
1054 }
1055
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
1057 struct bch_extent_ptr *ptr)
1058 {
1059 unsigned target = opts->promote_target ?: opts->foreground_target;
1060
1061 if (target && !bch2_dev_in_target(c, ptr->dev, target))
1062 return false;
1063
1064 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1065
1066 return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
1067 }
1068
void bch2_extent_ptr_set_cached(struct bch_fs *c,
1070 struct bch_io_opts *opts,
1071 struct bkey_s k,
1072 struct bch_extent_ptr *ptr)
1073 {
1074 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1075 union bch_extent_entry *entry;
1076 struct extent_ptr_decoded p;
1077
1078 rcu_read_lock();
1079 if (!want_cached_ptr(c, opts, ptr)) {
1080 bch2_bkey_drop_ptr_noerror(k, ptr);
1081 goto out;
1082 }
1083
/*
 * Stripes can't currently contain cached data - possibly something we
 * can fix in the future?
 */
1089 bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
1090 if (&entry->ptr == ptr) {
1091 if (p.has_ec)
1092 bch2_bkey_drop_ptr_noerror(k, ptr);
1093 else
1094 ptr->cached = true;
1095 goto out;
1096 }
1097
1098 BUG();
1099 out:
1100 rcu_read_unlock();
1101 }
1102
1103 /*
1104 * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
1105 *
1106 * Returns true if @k should be dropped entirely
1107 *
1108 * For existing keys, only called when btree nodes are being rewritten, not when
1109 * they're merely being compacted/resorted in memory.
1110 */
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
1112 {
1113 struct bch_dev *ca;
1114
1115 rcu_read_lock();
1116 bch2_bkey_drop_ptrs(k, ptr,
1117 ptr->cached &&
1118 (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
1119 dev_ptr_stale_rcu(ca, ptr) > 0));
1120 rcu_read_unlock();
1121
1122 return bkey_deleted(k.k);
1123 }
1124
1125 /*
1126 * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
1127 *
1128 * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
1129 * the promote target.
1130 */
bool bch2_extent_normalize_by_opts(struct bch_fs *c,
1132 struct bch_io_opts *opts,
1133 struct bkey_s k)
1134 {
1135 struct bkey_ptrs ptrs;
1136 bool have_cached_ptr;
1137
1138 rcu_read_lock();
1139 restart_drop_ptrs:
1140 ptrs = bch2_bkey_ptrs(k);
1141 have_cached_ptr = false;
1142
1143 bkey_for_each_ptr(ptrs, ptr)
1144 if (ptr->cached) {
1145 if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
1146 bch2_bkey_drop_ptr(k, ptr);
1147 goto restart_drop_ptrs;
1148 }
1149 have_cached_ptr = true;
1150 }
1151 rcu_read_unlock();
1152
1153 return bkey_deleted(k.k);
1154 }
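
/*
 * Example (illustrative): if an extent ends up with cached copies on two
 * devices after data moves around, the pass above keeps at most one cached
 * pointer - and only if it's on the promote (or foreground) target, healthy
 * and not stale - and drops the rest.
 */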
1155
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
1157 {
1158 out->atomic++;
1159 rcu_read_lock();
1160 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1161 if (!ca) {
1162 prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
1163 (u64) ptr->offset, ptr->gen,
1164 ptr->cached ? " cached" : "");
1165 } else {
1166 u32 offset;
1167 u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
1168
1169 prt_printf(out, "ptr: %u:%llu:%u gen %u",
1170 ptr->dev, b, offset, ptr->gen);
1171 if (ca->mi.durability != 1)
1172 prt_printf(out, " d=%u", ca->mi.durability);
1173 if (ptr->cached)
1174 prt_str(out, " cached");
1175 if (ptr->unwritten)
1176 prt_str(out, " unwritten");
1177 int stale = dev_ptr_stale_rcu(ca, ptr);
1178 if (stale > 0)
1179 prt_printf(out, " stale");
1180 else if (stale)
1181 prt_printf(out, " invalid");
1182 }
1183 rcu_read_unlock();
1184 --out->atomic;
1185 }
1186
void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
1188 {
1189 prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
1190 crc->compressed_size,
1191 crc->uncompressed_size,
1192 crc->offset, crc->nonce);
1193 bch2_prt_csum_type(out, crc->csum_type);
1194 prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
1195 prt_str(out, " compress ");
1196 bch2_prt_compression_type(out, crc->compression_type);
1197 }
1198
static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
1200 const struct bch_extent_rebalance *r)
1201 {
1202 prt_str(out, "rebalance:");
1203
1204 prt_printf(out, " replicas=%u", r->data_replicas);
1205 if (r->data_replicas_from_inode)
1206 prt_str(out, " (inode)");
1207
1208 prt_str(out, " checksum=");
1209 bch2_prt_csum_opt(out, r->data_checksum);
1210 if (r->data_checksum_from_inode)
1211 prt_str(out, " (inode)");
1212
1213 if (r->background_compression || r->background_compression_from_inode) {
1214 prt_str(out, " background_compression=");
1215 bch2_compression_opt_to_text(out, r->background_compression);
1216
1217 if (r->background_compression_from_inode)
1218 prt_str(out, " (inode)");
1219 }
1220
1221 if (r->background_target || r->background_target_from_inode) {
1222 prt_str(out, " background_target=");
1223 if (c)
1224 bch2_target_to_text(out, c, r->background_target);
1225 else
1226 prt_printf(out, "%u", r->background_target);
1227
1228 if (r->background_target_from_inode)
1229 prt_str(out, " (inode)");
1230 }
1231
1232 if (r->promote_target || r->promote_target_from_inode) {
1233 prt_str(out, " promote_target=");
1234 if (c)
1235 bch2_target_to_text(out, c, r->promote_target);
1236 else
1237 prt_printf(out, "%u", r->promote_target);
1238
1239 if (r->promote_target_from_inode)
1240 prt_str(out, " (inode)");
1241 }
1242
1243 if (r->erasure_code || r->erasure_code_from_inode) {
1244 prt_printf(out, " ec=%u", r->erasure_code);
1245 if (r->erasure_code_from_inode)
1246 prt_str(out, " (inode)");
1247 }
1248 }
1249
void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
1251 struct bkey_s_c k)
1252 {
1253 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1254 const union bch_extent_entry *entry;
1255 bool first = true;
1256
1257 if (c)
1258 prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
1259
1260 bkey_extent_entry_for_each(ptrs, entry) {
1261 if (!first)
1262 prt_printf(out, " ");
1263
1264 switch (__extent_entry_type(entry)) {
1265 case BCH_EXTENT_ENTRY_ptr:
1266 bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
1267 break;
1268
1269 case BCH_EXTENT_ENTRY_crc32:
1270 case BCH_EXTENT_ENTRY_crc64:
1271 case BCH_EXTENT_ENTRY_crc128: {
1272 struct bch_extent_crc_unpacked crc =
1273 bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1274
1275 bch2_extent_crc_unpacked_to_text(out, &crc);
1276 break;
1277 }
1278 case BCH_EXTENT_ENTRY_stripe_ptr: {
1279 const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
1280
1281 prt_printf(out, "ec: idx %llu block %u",
1282 (u64) ec->idx, ec->block);
1283 break;
1284 }
1285 case BCH_EXTENT_ENTRY_rebalance:
1286 bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
1287 break;
1288
1289 case BCH_EXTENT_ENTRY_flags:
1290 prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
1291 break;
1292
1293 default:
1294 prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
1295 return;
1296 }
1297
1298 first = false;
1299 }
1300 }
1301
static int extent_ptr_validate(struct bch_fs *c,
1303 struct bkey_s_c k,
1304 struct bkey_validate_context from,
1305 const struct bch_extent_ptr *ptr,
1306 unsigned size_ondisk,
1307 bool metadata)
1308 {
1309 int ret = 0;
1310
1311 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1312 bkey_for_each_ptr(ptrs, ptr2)
1313 bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
1314 c, ptr_to_duplicate_device,
1315 "multiple pointers to same device (%u)", ptr->dev);
1316
1317 /* bad pointers are repaired by check_fix_ptrs(): */
1318 rcu_read_lock();
1319 struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
1320 if (!ca) {
1321 rcu_read_unlock();
1322 return 0;
1323 }
1324 u32 bucket_offset;
1325 u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
1326 unsigned first_bucket = ca->mi.first_bucket;
1327 u64 nbuckets = ca->mi.nbuckets;
1328 unsigned bucket_size = ca->mi.bucket_size;
1329 rcu_read_unlock();
1330
1331 bkey_fsck_err_on(bucket >= nbuckets,
1332 c, ptr_after_last_bucket,
1333 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
1334 bkey_fsck_err_on(bucket < first_bucket,
1335 c, ptr_before_first_bucket,
1336 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
1337 bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
1338 c, ptr_spans_multiple_buckets,
1339 "pointer spans multiple buckets (%u + %u > %u)",
1340 bucket_offset, size_ondisk, bucket_size);
1341 fsck_err:
1342 return ret;
1343 }
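
/*
 * Worked example of the bucket checks above (illustrative): with 1024 sector
 * buckets, a pointer at sector 1049600 lands in bucket 1025, which is
 * rejected if the device only has 1000 buckets; likewise a 16 sector extent
 * starting at bucket offset 1016 would span two buckets (1016 + 16 > 1024)
 * and is rejected.
 */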
1344
int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
1346 struct bkey_validate_context from)
1347 {
1348 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1349 const union bch_extent_entry *entry;
1350 struct bch_extent_crc_unpacked crc;
1351 unsigned size_ondisk = k.k->size;
1352 unsigned nonce = UINT_MAX;
1353 unsigned nr_ptrs = 0;
1354 bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
1355 int ret = 0;
1356
1357 if (bkey_is_btree_ptr(k.k))
1358 size_ondisk = btree_sectors(c);
1359
1360 bkey_extent_entry_for_each(ptrs, entry) {
1361 bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
1362 c, extent_ptrs_invalid_entry,
1363 "invalid extent entry type (got %u, max %u)",
1364 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
1365
1366 bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
1367 !extent_entry_is_ptr(entry),
1368 c, btree_ptr_has_non_ptr,
1369 "has non ptr field");
1370
1371 switch (extent_entry_type(entry)) {
1372 case BCH_EXTENT_ENTRY_ptr:
1373 ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
1374 if (ret)
1375 return ret;
1376
1377 bkey_fsck_err_on(entry->ptr.cached && have_ec,
1378 c, ptr_cached_and_erasure_coded,
1379 "cached, erasure coded ptr");
1380
1381 if (!entry->ptr.unwritten)
1382 have_written = true;
1383 else
1384 have_unwritten = true;
1385
1386 have_ec = false;
1387 crc_since_last_ptr = false;
1388 nr_ptrs++;
1389 break;
1390 case BCH_EXTENT_ENTRY_crc32:
1391 case BCH_EXTENT_ENTRY_crc64:
1392 case BCH_EXTENT_ENTRY_crc128:
1393 crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
1394
1395 bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
1396 c, ptr_crc_csum_type_unknown,
1397 "invalid checksum type");
1398 bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
1399 c, ptr_crc_compression_type_unknown,
1400 "invalid compression type");
1401
1402 bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
1403 c, ptr_crc_uncompressed_size_too_small,
1404 "checksum offset + key size > uncompressed size");
1405 bkey_fsck_err_on(crc_is_encoded(crc) &&
1406 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
1407 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
1408 c, ptr_crc_uncompressed_size_too_big,
1409 "too large encoded extent");
1410 bkey_fsck_err_on(!crc_is_compressed(crc) &&
1411 crc.compressed_size != crc.uncompressed_size,
1412 c, ptr_crc_uncompressed_size_mismatch,
1413 "not compressed but compressed != uncompressed size");
1414
1415 if (bch2_csum_type_is_encryption(crc.csum_type)) {
1416 if (nonce == UINT_MAX)
1417 nonce = crc.offset + crc.nonce;
1418 else if (nonce != crc.offset + crc.nonce)
1419 bkey_fsck_err(c, ptr_crc_nonce_mismatch,
1420 "incorrect nonce");
1421 }
1422
1423 bkey_fsck_err_on(crc_since_last_ptr,
1424 c, ptr_crc_redundant,
1425 "redundant crc entry");
1426 crc_since_last_ptr = true;
1427
1428 size_ondisk = crc.compressed_size;
1429 break;
1430 case BCH_EXTENT_ENTRY_stripe_ptr:
1431 bkey_fsck_err_on(have_ec,
1432 c, ptr_stripe_redundant,
1433 "redundant stripe entry");
1434 have_ec = true;
1435 break;
1436 case BCH_EXTENT_ENTRY_rebalance: {
1437 /*
1438 * this shouldn't be a fsck error, for forward
1439 * compatibility; the rebalance code should just refetch
1440 * the compression opt if it's unknown
1441 */
1442 #if 0
1443 const struct bch_extent_rebalance *r = &entry->rebalance;
1444
1445 if (!bch2_compression_opt_valid(r->compression)) {
1446 struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
1447 prt_printf(err, "invalid compression opt %u:%u",
1448 opt.type, opt.level);
1449 return -BCH_ERR_invalid_bkey;
1450 }
1451 #endif
1452 break;
1453 }
1454 case BCH_EXTENT_ENTRY_flags:
1455 bkey_fsck_err_on(entry != ptrs.start,
1456 c, extent_flags_not_at_start,
1457 "extent flags entry not at start");
1458 break;
1459 }
1460 }
1461
1462 bkey_fsck_err_on(!nr_ptrs,
1463 c, extent_ptrs_no_ptrs,
1464 "no ptrs");
1465 bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
1466 c, extent_ptrs_too_many_ptrs,
1467 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
1468 bkey_fsck_err_on(have_written && have_unwritten,
1469 c, extent_ptrs_written_and_unwritten,
1470 "extent with unwritten and written ptrs");
1471 bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
1472 c, extent_ptrs_unwritten,
1473 "has unwritten ptrs");
1474 bkey_fsck_err_on(crc_since_last_ptr,
1475 c, extent_ptrs_redundant_crc,
1476 "redundant crc entry");
1477 bkey_fsck_err_on(have_ec,
1478 c, extent_ptrs_redundant_stripe,
1479 "redundant stripe entry");
1480 fsck_err:
1481 return ret;
1482 }
1483
void bch2_ptr_swab(struct bkey_s k)
1485 {
1486 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1487 union bch_extent_entry *entry;
1488 u64 *d;
1489
1490 for (d = (u64 *) ptrs.start;
1491 d != (u64 *) ptrs.end;
1492 d++)
1493 *d = swab64(*d);
1494
1495 for (entry = ptrs.start;
1496 entry < ptrs.end;
1497 entry = extent_entry_next(entry)) {
1498 switch (__extent_entry_type(entry)) {
1499 case BCH_EXTENT_ENTRY_ptr:
1500 break;
1501 case BCH_EXTENT_ENTRY_crc32:
1502 entry->crc32.csum = swab32(entry->crc32.csum);
1503 break;
1504 case BCH_EXTENT_ENTRY_crc64:
1505 entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
1506 entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
1507 break;
1508 case BCH_EXTENT_ENTRY_crc128:
1509 entry->crc128.csum.hi = (__force __le64)
1510 swab64((__force u64) entry->crc128.csum.hi);
1511 entry->crc128.csum.lo = (__force __le64)
1512 swab64((__force u64) entry->crc128.csum.lo);
1513 break;
1514 case BCH_EXTENT_ENTRY_stripe_ptr:
1515 break;
1516 case BCH_EXTENT_ENTRY_rebalance:
1517 break;
1518 default:
1519 /* Bad entry type: will be caught by validate() */
1520 return;
1521 }
1522 }
1523 }
1524
int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
1526 {
1527 int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
1528 if (ret)
1529 return ret;
1530
1531 struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
1532
1533 if (ptrs.start != ptrs.end &&
1534 extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
1535 ptrs.start->flags.flags = flags;
1536 } else {
1537 struct bch_extent_flags f = {
1538 .type = BIT(BCH_EXTENT_ENTRY_flags),
1539 .flags = flags,
1540 };
1541 __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
1542 }
1543
1544 return 0;
1545 }
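
/*
 * Usage sketch (illustrative; @new is the caller's bkey_i, and the caller is
 * assumed to go on to update the key): marking an extent as poisoned after an
 * unrecoverable read error:
 *
 *	int ret = bch2_bkey_extent_flags_set(c, new,
 *				BIT_ULL(BCH_EXTENT_FLAG_poisoned));
 *	if (ret)	// e.g. extent_flags incompat feature not enabled
 *		return ret;
 */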
1546
1547 /* Generic extent code: */
1548
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
1550 {
1551 unsigned new_val_u64s = bkey_val_u64s(k.k);
1552 int val_u64s_delta;
1553 u64 sub;
1554
1555 if (bkey_le(where, bkey_start_pos(k.k)))
1556 return 0;
1557
1558 EBUG_ON(bkey_gt(where, k.k->p));
1559
1560 sub = where.offset - bkey_start_offset(k.k);
1561
1562 k.k->size -= sub;
1563
1564 if (!k.k->size) {
1565 k.k->type = KEY_TYPE_deleted;
1566 new_val_u64s = 0;
1567 }
1568
1569 switch (k.k->type) {
1570 case KEY_TYPE_extent:
1571 case KEY_TYPE_reflink_v: {
1572 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
1573 union bch_extent_entry *entry;
1574 bool seen_crc = false;
1575
1576 bkey_extent_entry_for_each(ptrs, entry) {
1577 switch (extent_entry_type(entry)) {
1578 case BCH_EXTENT_ENTRY_ptr:
1579 if (!seen_crc)
1580 entry->ptr.offset += sub;
1581 break;
1582 case BCH_EXTENT_ENTRY_crc32:
1583 entry->crc32.offset += sub;
1584 break;
1585 case BCH_EXTENT_ENTRY_crc64:
1586 entry->crc64.offset += sub;
1587 break;
1588 case BCH_EXTENT_ENTRY_crc128:
1589 entry->crc128.offset += sub;
1590 break;
1591 case BCH_EXTENT_ENTRY_stripe_ptr:
1592 case BCH_EXTENT_ENTRY_rebalance:
1593 case BCH_EXTENT_ENTRY_flags:
1594 break;
1595 }
1596
1597 if (extent_entry_is_crc(entry))
1598 seen_crc = true;
1599 }
1600
1601 break;
1602 }
1603 case KEY_TYPE_reflink_p: {
1604 struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
1605
1606 SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
1607 break;
1608 }
1609 case KEY_TYPE_inline_data:
1610 case KEY_TYPE_indirect_inline_data: {
1611 void *p = bkey_inline_data_p(k);
1612 unsigned bytes = bkey_inline_data_bytes(k.k);
1613
1614 sub = min_t(u64, sub << 9, bytes);
1615
1616 memmove(p, p + sub, bytes - sub);
1617
1618 new_val_u64s -= sub >> 3;
1619 break;
1620 }
1621 }
1622
1623 val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1624 BUG_ON(val_u64s_delta < 0);
1625
1626 set_bkey_val_u64s(k.k, new_val_u64s);
1627 memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1628 return -val_u64s_delta;
1629 }
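
/*
 * Worked example (illustrative): cutting the first 8 sectors off a 16 sector
 * checksummed extent doesn't touch the pointers or the on-disk data; each crc
 * entry's offset is bumped by 8 and the key shrinks to 8 sectors, so reads
 * still checksum the full on-disk region and discard the dropped sectors.
 * Only inline data keys actually shrink their value, and the return value is
 * minus the number of u64s the value shrank by.
 */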
1630
int bch2_cut_back_s(struct bpos where, struct bkey_s k)
1632 {
1633 unsigned new_val_u64s = bkey_val_u64s(k.k);
1634 int val_u64s_delta;
1635 u64 len = 0;
1636
1637 if (bkey_ge(where, k.k->p))
1638 return 0;
1639
1640 EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
1641
1642 len = where.offset - bkey_start_offset(k.k);
1643
1644 k.k->p.offset = where.offset;
1645 k.k->size = len;
1646
1647 if (!len) {
1648 k.k->type = KEY_TYPE_deleted;
1649 new_val_u64s = 0;
1650 }
1651
1652 switch (k.k->type) {
1653 case KEY_TYPE_inline_data:
1654 case KEY_TYPE_indirect_inline_data:
1655 new_val_u64s = (bkey_inline_data_offset(k.k) +
1656 min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
1657 break;
1658 }
1659
1660 val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
1661 BUG_ON(val_u64s_delta < 0);
1662
1663 set_bkey_val_u64s(k.k, new_val_u64s);
1664 memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
1665 return -val_u64s_delta;
1666 }
1667