// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Code for managing the extent btree and dynamically updating the writeback
 * dirty sector count.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"
#include "util.h"

static const char * const bch2_extent_flags_strs[] = {
#define x(n, v)	[BCH_EXTENT_FLAG_##n] = #n,
	BCH_EXTENT_FLAGS()
#undef x
	NULL,
};

static unsigned bch2_crc_field_size_max[] = {
	[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
	[BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
};

static void bch2_extent_crc_pack(union bch_extent_crc *,
				 struct bch_extent_crc_unpacked,
				 enum bch_extent_entry_type);

struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
						 unsigned dev)
{
	struct bch_dev_io_failures *i;

	for (i = f->devs; i < f->devs + f->nr; i++)
		if (i->dev == dev)
			return i;

	return NULL;
}

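/*
 * Record a failed read against the device this pointer lives on; EC
 * reconstruct failures, plain I/O errors and checksum errors are tracked
 * separately so the read path can tell which pointers are still worth
 * retrying:
 */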
void bch2_mark_io_failure(struct bch_io_failures *failed,
			  struct extent_ptr_decoded *p,
			  bool csum_error)
{
	struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);

	if (!f) {
		BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));

		f = &failed->devs[failed->nr++];
		memset(f, 0, sizeof(*f));
		f->dev = p->ptr.dev;
	}

	if (p->do_ec_reconstruct)
		f->failed_ec = true;
	else if (!csum_error)
		f->failed_io = true;
	else
		f->failed_csum_nr++;
}

static inline u64 dev_latency(struct bch_dev *ca)
{
	return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}

static inline int dev_failed(struct bch_dev *ca)
{
	return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
}

/*
 * returns true if p1 is better than p2:
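 *
 * Preference order: a non-failed device wins; then, unless reads are being
 * forced to reconstruct, pointers that don't need EC reconstruction; then
 * fewer checksum retries; then a random pick biased towards the lower
 * latency device.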
 */
static inline bool ptr_better(struct bch_fs *c,
			      const struct extent_ptr_decoded p1,
			      u64 p1_latency,
			      struct bch_dev *ca1,
			      const struct extent_ptr_decoded p2,
			      u64 p2_latency)
{
	struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);

	int failed_delta = dev_failed(ca1) - dev_failed(ca2);
	if (unlikely(failed_delta))
		return failed_delta < 0;

	if (unlikely(bch2_force_reconstruct_read))
		return p1.do_ec_reconstruct > p2.do_ec_reconstruct;

	if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
		return p1.do_ec_reconstruct < p2.do_ec_reconstruct;

	int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
	if (unlikely(crc_retry_delta))
		return crc_retry_delta < 0;

	/* Pick at random, biased in favor of the faster device: */

	return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}

/*
 * Pick the best pointer to read from, taking into account any prior I/O,
 * checksum or erasure coding failures recorded in @failed, and restricting
 * the choice to @dev if @dev is >= 0.
 *
 * Returns 1 if a pointer was picked, 0 if the extent is an unwritten hole
 * or has no dirty pointers, or a negative error code.
 */
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_failures *failed,
			       struct extent_ptr_decoded *pick,
			       int dev)
{
	bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
	bool have_dirty_ptrs = false, have_pick = false;

	if (k.k->type == KEY_TYPE_error)
		return -BCH_ERR_key_type_error;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return -BCH_ERR_extent_poisoned;

	rcu_read_lock();
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 pick_latency;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		have_dirty_ptrs |= !p.ptr.cached;

		/*
		 * Unwritten extent: no need to actually read, treat it as a
		 * hole and return 0s:
		 */
		if (p.ptr.unwritten) {
			rcu_read_unlock();
			return 0;
		}

		/* Are we being asked to read from a specific device? */
		if (dev >= 0 && p.ptr.dev != dev)
			continue;

		struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);

		if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
			continue;

		struct bch_dev_io_failures *f =
			unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
		if (unlikely(f)) {
			p.crc_retry_nr	   = f->failed_csum_nr;
			p.has_ec	  &= ~f->failed_ec;

			if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
				have_io_errors	|= f->failed_io;
				have_io_errors	|= f->failed_ec;
			}
			have_csum_errors	|= !!f->failed_csum_nr;

			if (p.has_ec && (f->failed_io || f->failed_csum_nr))
				p.do_ec_reconstruct = true;
			else if (f->failed_io ||
				 f->failed_csum_nr > c->opts.checksum_err_retry_nr)
				continue;
		}

		have_missing_devs |= ca && !bch2_dev_is_online(ca);

		if (!ca || !bch2_dev_is_online(ca)) {
			if (!p.has_ec)
				continue;
			p.do_ec_reconstruct = true;
		}

		if (bch2_force_reconstruct_read && p.has_ec)
			p.do_ec_reconstruct = true;

		u64 p_latency = dev_latency(ca);
		/*
		 * Square the latencies, to bias more in favor of the faster
		 * device - we never want to stop issuing reads to the slower
		 * device altogether, so that we can update our latency numbers:
		 */
		p_latency *= p_latency;

		if (!have_pick ||
		    ptr_better(c,
			       p, p_latency, ca,
			       *pick, pick_latency)) {
			*pick = p;
			pick_latency = p_latency;
			have_pick = true;
		}
	}
	rcu_read_unlock();

	if (have_pick)
		return 1;
	if (!have_dirty_ptrs)
		return 0;
	if (have_missing_devs)
		return -BCH_ERR_no_device_to_read_from;
	if (have_csum_errors)
		return -BCH_ERR_data_read_csum_err;
	if (have_io_errors)
		return -BCH_ERR_data_read_io_err;

	/*
	 * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
	 * but they don't point to valid devices:
	 */
	return -BCH_ERR_no_devices_valid;
}

/* KEY_TYPE_btree_ptr: */

int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
			    struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
			 c, btree_ptr_val_too_big,
			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);

	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}

void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	bch2_bkey_ptrs_to_text(out, c, k);
}

int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			       struct bkey_validate_context from)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
	int ret = 0;

	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
			 c, btree_ptr_v2_val_too_big,
			 "value too big (%zu > %zu)",
			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);

	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
			 c, btree_ptr_v2_min_key_bad,
			 "min_key > key");

	if ((from.flags & BCH_VALIDATE_write) &&
	    c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
		bkey_fsck_err_on(!bp.v->sectors_written,
				 c, btree_ptr_v2_written_0,
				 "sectors_written == 0");

	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}

void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
			       struct bkey_s_c k)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

	prt_printf(out, "seq %llx written %u min_key %s",
	       le64_to_cpu(bp.v->seq),
	       le16_to_cpu(bp.v->sectors_written),
	       BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");

	bch2_bpos_to_text(out, bp.v->min_key);
	prt_printf(out, " ");
	bch2_bkey_ptrs_to_text(out, c, k);
}

void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
			      unsigned big_endian, int write,
			      struct bkey_s k)
{
	struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);

	compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);

	if (version < bcachefs_metadata_version_inode_btree_change &&
	    btree_id_is_extents(btree_id) &&
	    !bkey_eq(bp.v->min_key, POS_MIN))
		bp.v->min_key = write
			? bpos_nosnap_predecessor(bp.v->min_key)
			: bpos_nosnap_successor(bp.v->min_key);
}

/* KEY_TYPE_extent: */

bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
	struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
	struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r);
	union bch_extent_entry *en_l;
	const union bch_extent_entry *en_r;
	struct extent_ptr_decoded lp, rp;
	bool use_right_ptr;

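	/* First pass: both extents must have the same sequence of entry types: */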
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
		if (extent_entry_type(en_l) != extent_entry_type(en_r))
			return false;

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	if (en_l < l_ptrs.end || en_r < r_ptrs.end)
		return false;

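	/*
	 * Second pass: check that the pointers and checksums can actually be
	 * merged - pointers must be contiguous on disk, within the same
	 * bucket, with matching gen/EC info, and checksum entries must either
	 * still cover the merged extent or be mergeable:
	 */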
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	lp.crc = bch2_extent_crc_unpack(l.k, NULL);
	rp.crc = bch2_extent_crc_unpack(r.k, NULL);

	while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
	       __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
		if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
		    rp.ptr.offset + rp.crc.offset ||
		    lp.ptr.dev			!= rp.ptr.dev ||
		    lp.ptr.gen			!= rp.ptr.gen ||
		    lp.ptr.unwritten		!= rp.ptr.unwritten ||
		    lp.has_ec			!= rp.has_ec)
			return false;

		/* Extents may not straddle buckets: */
		rcu_read_lock();
		struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
		bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
		rcu_read_unlock();

		if (!same_bucket)
			return false;

		if (lp.has_ec			!= rp.has_ec ||
		    (lp.has_ec &&
		     (lp.ec.block		!= rp.ec.block ||
		      lp.ec.redundancy		!= rp.ec.redundancy ||
		      lp.ec.idx			!= rp.ec.idx)))
			return false;

		if (lp.crc.compression_type	!= rp.crc.compression_type ||
		    lp.crc.nonce		!= rp.crc.nonce)
			return false;

		if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
		    lp.crc.uncompressed_size) {
			/* can use left extent's crc entry */
		} else if (lp.crc.live_size <= rp.crc.offset) {
			/* can use right extent's crc entry */
		} else {
			/* check if checksums can be merged: */
			if (lp.crc.csum_type		!= rp.crc.csum_type ||
			    lp.crc.nonce		!= rp.crc.nonce ||
			    crc_is_compressed(lp.crc) ||
			    !bch2_checksum_mergeable(lp.crc.csum_type))
				return false;

			if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
			    rp.crc.offset)
				return false;

			if (lp.crc.csum_type &&
			    lp.crc.uncompressed_size +
			    rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
				return false;
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

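	/*
	 * Third pass: merged checksum entries must still fit in their packed
	 * crc format:
	 */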
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
		if (extent_entry_is_crc(en_l)) {
			struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
			struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));

			if (crc_l.uncompressed_size + crc_r.uncompressed_size >
			    bch2_crc_field_size_max[extent_entry_type(en_l)])
				return false;
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

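	/*
	 * Do the merge: keep the left extent's entries, pulling in pointers
	 * and repacked checksums from the right extent where the left entry
	 * can't cover the combined extent:
	 */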
	use_right_ptr = false;
	en_l = l_ptrs.start;
	en_r = r_ptrs.start;
	while (en_l < l_ptrs.end) {
		if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr &&
		    use_right_ptr)
			en_l->ptr = en_r->ptr;

		if (extent_entry_is_crc(en_l)) {
			struct bch_extent_crc_unpacked crc_l =
				bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
			struct bch_extent_crc_unpacked crc_r =
				bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));

			use_right_ptr = false;

			if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
			    crc_l.uncompressed_size) {
				/* can use left extent's crc entry */
			} else if (crc_l.live_size <= crc_r.offset) {
				/* can use right extent's crc entry */
				crc_r.offset -= crc_l.live_size;
				bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
						     extent_entry_type(en_l));
				use_right_ptr = true;
			} else {
				crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
								 crc_l.csum,
								 crc_r.csum,
								 crc_r.uncompressed_size << 9);

				crc_l.uncompressed_size	+= crc_r.uncompressed_size;
				crc_l.compressed_size	+= crc_r.compressed_size;
				bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
						     extent_entry_type(en_l));
			}
		}

		en_l = extent_entry_next(en_l);
		en_r = extent_entry_next(en_r);
	}

	bch2_key_resize(l.k, l.k->size + r.k->size);
	return true;
}

/* KEY_TYPE_reservation: */

int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
			      struct bkey_validate_context from)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
	int ret = 0;

	bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX,
			 c, reservation_key_nr_replicas_invalid,
			 "invalid nr_replicas (%u)", r.v->nr_replicas);
fsck_err:
	return ret;
}

void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
			      struct bkey_s_c k)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);

	prt_printf(out, "generation %u replicas %u",
	       le32_to_cpu(r.v->generation),
	       r.v->nr_replicas);
}

bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
{
	struct bkey_s_reservation l = bkey_s_to_reservation(_l);
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r);

	if (l.v->generation != r.v->generation ||
	    l.v->nr_replicas != r.v->nr_replicas)
		return false;

	bch2_key_resize(l.k, l.k->size + r.k->size);
	return true;
}

/* Extent checksum entries: */

/* returns true if not equal */
static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
					 struct bch_extent_crc_unpacked r)
{
	return (l.csum_type		!= r.csum_type ||
		l.compression_type	!= r.compression_type ||
		l.compressed_size	!= r.compressed_size ||
		l.uncompressed_size	!= r.uncompressed_size ||
		l.offset		!= r.offset ||
		l.live_size		!= r.live_size ||
		l.nonce			!= r.nonce ||
		bch2_crc_cmp(l.csum, r.csum));
}

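/*
 * "Narrowing" a crc entry means replacing a checksum that covers more data
 * than is currently live in the extent with one that covers exactly the live
 * range, so readers no longer have to read and checksum the dead portions:
 */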
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
				  struct bch_extent_crc_unpacked n)
{
	return !crc_is_compressed(u) &&
		u.csum_type &&
		u.uncompressed_size > u.live_size &&
		bch2_csum_type_is_encryption(u.csum_type) ==
		bch2_csum_type_is_encryption(n.csum_type);
}

bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
				 struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	if (!n.csum_type)
		return false;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (can_narrow_crc(crc, n))
			return true;

	return false;
}

/*
 * We're writing another replica for this extent, so while we've got the data in
 * memory we'll be computing a new checksum for the currently live data.
 *
 * If there are other replicas we aren't moving, and they are checksummed but
 * not compressed, we can modify them to point to only the data that is
 * currently live (so that readers won't have to bounce) while we've got the
 * checksum we need:
 */
bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked u;
	struct extent_ptr_decoded p;
	union bch_extent_entry *i;
	bool ret = false;

	/* Find a checksum entry that covers only live data: */
	if (!n.csum_type) {
		bkey_for_each_crc(&k->k, ptrs, u, i)
			if (!crc_is_compressed(u) &&
			    u.csum_type &&
			    u.live_size == u.uncompressed_size) {
				n = u;
				goto found;
			}
		return false;
	}
found:
	BUG_ON(crc_is_compressed(n));
	BUG_ON(n.offset);
	BUG_ON(n.live_size != k->k.size);

restart_narrow_pointers:
	ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));

	bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
		if (can_narrow_crc(p.crc, n)) {
			bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
			p.ptr.offset += p.crc.offset;
			p.crc = n;
			bch2_extent_ptr_decoded_append(k, &p);
			ret = true;
			goto restart_narrow_pointers;
		}

	return ret;
}

static void bch2_extent_crc_pack(union bch_extent_crc *dst,
				 struct bch_extent_crc_unpacked src,
				 enum bch_extent_entry_type type)
{
#define common_fields(_src)						\
		.type			= BIT(type),			\
		.csum_type		= _src.csum_type,		\
		.compression_type	= _src.compression_type,	\
		._compressed_size	= _src.compressed_size - 1,	\
		._uncompressed_size	= _src.uncompressed_size - 1,	\
		.offset			= _src.offset

	switch (type) {
	case BCH_EXTENT_ENTRY_crc32:
		dst->crc32		= (struct bch_extent_crc32) {
			common_fields(src),
			.csum		= (u32 __force) *((__le32 *) &src.csum.lo),
		};
		break;
	case BCH_EXTENT_ENTRY_crc64:
		dst->crc64		= (struct bch_extent_crc64) {
			common_fields(src),
			.nonce		= src.nonce,
			.csum_lo	= (u64 __force) src.csum.lo,
			.csum_hi	= (u64 __force) *((__le16 *) &src.csum.hi),
		};
		break;
	case BCH_EXTENT_ENTRY_crc128:
		dst->crc128		= (struct bch_extent_crc128) {
			common_fields(src),
			.nonce		= src.nonce,
			.csum		= src.csum,
		};
		break;
	default:
		BUG();
	}
#undef common_fields
}

void bch2_extent_crc_append(struct bkey_i *k,
			    struct bch_extent_crc_unpacked new)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	union bch_extent_crc *crc = (void *) ptrs.end;
	enum bch_extent_entry_type type;

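	/*
	 * Use the smallest packed crc variant that can hold this checksum,
	 * size and nonce:
	 */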
	if (bch_crc_bytes[new.csum_type]	<= 4 &&
	    new.uncompressed_size		<= CRC32_SIZE_MAX &&
	    new.nonce				<= CRC32_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc32;
	else if (bch_crc_bytes[new.csum_type]	<= 10 &&
		   new.uncompressed_size	<= CRC64_SIZE_MAX &&
		   new.nonce			<= CRC64_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc64;
	else if (bch_crc_bytes[new.csum_type]	<= 16 &&
		   new.uncompressed_size	<= CRC128_SIZE_MAX &&
		   new.nonce			<= CRC128_NONCE_MAX)
		type = BCH_EXTENT_ENTRY_crc128;
	else
		BUG();

	bch2_extent_crc_pack(crc, new, type);

	k->k.u64s += extent_entry_u64s(ptrs.end);

	EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
}

/* Generic code for keys with pointers: */

unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
{
	return bch2_bkey_devs(k).nr;
}

unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
{
	return k.k->type == KEY_TYPE_reservation
		? bkey_s_c_to_reservation(k).v->nr_replicas
		: bch2_bkey_dirty_devs(k).nr;
}

unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
{
	unsigned ret = 0;

	if (k.k->type == KEY_TYPE_reservation) {
		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
	} else {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			ret += !p.ptr.cached && !crc_is_compressed(p.crc);
	}

	return ret;
}

unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ret = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (!p.ptr.cached && crc_is_compressed(p.crc))
			ret += p.crc.compressed_size;

	return ret;
}

bool bch2_bkey_is_incompressible(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;

	bkey_for_each_crc(k.k, ptrs, crc, entry)
		if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			return true;
	return false;
}

unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p = { 0 };
	unsigned replicas = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (p.has_ec)
			replicas += p.ec.redundancy;

		replicas++;

	}

	return replicas;
}

static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
{
	if (p->ptr.cached)
		return 0;

	return p->has_ec
		? p->ec.redundancy + 1
		: ca->mi.durability;
}

unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);

	return ca ? __extent_ptr_durability(ca, p) : 0;
}

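/*
 * Unlike the desired durability above, pointers to failed (or missing)
 * devices count for nothing here:
 */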
unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
{
	struct bch_dev *ca = bch2_dev_rcu(c, p->ptr.dev);

	if (!ca || ca->mi.state == BCH_MEMBER_STATE_failed)
		return 0;

	return __extent_ptr_durability(ca, p);
}

unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned durability = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		durability += bch2_extent_ptr_durability(c, &p);
	rcu_read_unlock();

	return durability;
}

static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned durability = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
			durability += bch2_extent_ptr_durability(c, &p);
	rcu_read_unlock();

	return durability;
}

void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
	union bch_extent_entry *next = extent_entry_next(entry);

	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
	k->k.u64s -= extent_entry_u64s(entry);
}

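/*
 * Append a decoded pointer, reusing an existing crc entry that matches
 * p->crc or appending a new one, plus a stripe pointer if p->has_ec:
 */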
void bch2_extent_ptr_decoded_append(struct bkey_i *k,
				    struct extent_ptr_decoded *p)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked crc =
		bch2_extent_crc_unpack(&k->k, NULL);
	union bch_extent_entry *pos;

	if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
		pos = ptrs.start;
		goto found;
	}

	bkey_for_each_crc(&k->k, ptrs, crc, pos)
		if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
			pos = extent_entry_next(pos);
			goto found;
		}

	bch2_extent_crc_append(k, p->crc);
	pos = bkey_val_end(bkey_i_to_s(k));
found:
	p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
	__extent_entry_insert(k, pos, to_entry(&p->ptr));

	if (p->has_ec) {
		p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
		__extent_entry_insert(k, pos, to_entry(&p->ec));
	}
}

static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
					  union bch_extent_entry *entry)
{
	union bch_extent_entry *i = ptrs.start;

	if (i == entry)
		return NULL;

	while (extent_entry_next(i) != entry)
		i = extent_entry_next(i);
	return i;
}

/*
 * Drop @ptr from @k, along with any crc or stripe entries that only applied
 * to it:
 */
void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry = to_entry(ptr), *next;
	bool drop_crc = true;

	if (k.k->type == KEY_TYPE_stripe) {
		ptr->dev = BCH_SB_MEMBER_INVALID;
		return;
	}

	EBUG_ON(ptr < &ptrs.start->ptr ||
		ptr >= &ptrs.end->ptr);
	EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);

	for (next = extent_entry_next(entry);
	     next != ptrs.end;
	     next = extent_entry_next(next)) {
		if (extent_entry_is_crc(next)) {
			break;
		} else if (extent_entry_is_ptr(next)) {
			drop_crc = false;
			break;
		}
	}

	extent_entry_drop(k, entry);

	while ((entry = extent_entry_prev(ptrs, entry))) {
		if (extent_entry_is_ptr(entry))
			break;

		if ((extent_entry_is_crc(entry) && drop_crc) ||
		    extent_entry_is_stripe_ptr(entry))
			extent_entry_drop(k, entry);
	}
}

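/*
 * Like bch2_bkey_drop_ptr_noerror(), but also switches @k to an error (or
 * deleted) key if we just dropped the last dirty (or last remaining) pointer:
 */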
void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr)
{
	if (k.k->type != KEY_TYPE_stripe) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (p.ptr.dev == ptr->dev && p.has_ec) {
				ptr->dev = BCH_SB_MEMBER_INVALID;
				return;
			}
	}

	bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;

	bch2_bkey_drop_ptr_noerror(k, ptr);

	/*
	 * If we deleted all the dirty pointers and there's still cached
	 * pointers, we could set the cached pointers to dirty if they're not
	 * stale - but to do that correctly we'd need to grab an open_bucket
	 * reference so that we don't race with bucket reuse:
	 */
	if (have_dirty &&
	    !bch2_bkey_dirty_devs(k.s_c).nr) {
		k.k->type = KEY_TYPE_error;
		set_bkey_val_u64s(k.k, 0);
	} else if (!bch2_bkey_nr_ptrs(k.s_c)) {
		k.k->type = KEY_TYPE_deleted;
		set_bkey_val_u64s(k.k, 0);
	}
}

void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
{
	bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}

void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
{
	bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev);
}

const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->dev == dev)
			return ptr;

	return NULL;
}

bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_dev *ca;
	bool ret = false;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr)
		if (bch2_dev_in_target(c, ptr->dev, target) &&
		    (ca = bch2_dev_rcu(c, ptr->dev)) &&
		    (!ptr->cached ||
		     !dev_ptr_stale_rcu(ca, ptr))) {
			ret = true;
			break;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
			   struct bch_extent_ptr m, u64 offset)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (p.ptr.dev	== m.dev &&
		    p.ptr.gen	== m.gen &&
		    (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
		    (s64) m.offset  - offset)
			return true;

	return false;
}

/*
 * Returns true if two extents refer to the same data:
 */
bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
{
	if (k1.k->type != k2.k->type)
		return false;

	if (bkey_extent_is_direct_data(k1.k)) {
		struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
		struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
		const union bch_extent_entry *entry1, *entry2;
		struct extent_ptr_decoded p1, p2;

		if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
			return false;

		bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
				if (p1.ptr.dev		== p2.ptr.dev &&
				    p1.ptr.gen		== p2.ptr.gen &&

				    /*
				     * This checks that the two pointers point
				     * to the same region on disk - adjusting
				     * for the difference in where the extents
				     * start, since one may have been trimmed:
				     */
				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&

				    /*
				     * This additionally checks that the
				     * extents overlap on disk, since the
				     * previous check may trigger spuriously
				     * when one extent is immediately partially
				     * overwritten with another extent (so that
				     * on disk they are adjacent) and
				     * compression is in use:
				     */
				    ((p1.ptr.offset >= p2.ptr.offset &&
				      p1.ptr.offset  < p2.ptr.offset + p2.crc.compressed_size) ||
				     (p2.ptr.offset >= p1.ptr.offset &&
				      p2.ptr.offset  < p1.ptr.offset + p1.crc.compressed_size)))
					return true;

		return false;
	} else {
		/* KEY_TYPE_deleted, etc. */
		return true;
	}
}

struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
{
	struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
	union bch_extent_entry *entry2;
	struct extent_ptr_decoded p2;

	bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
		if (p1.ptr.dev		== p2.ptr.dev &&
		    p1.ptr.gen		== p2.ptr.gen &&
		    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
		    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
			return &entry2->ptr;

	return NULL;
}

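/*
 * Should we keep a cached copy on this device? Only if it's in the promote
 * (or foreground) target, on a healthy device, and the pointer isn't stale:
 */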
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
			    struct bch_extent_ptr *ptr)
{
	unsigned target = opts->promote_target ?: opts->foreground_target;

	if (target && !bch2_dev_in_target(c, ptr->dev, target))
		return false;

	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);

	return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}

void bch2_extent_ptr_set_cached(struct bch_fs *c,
				struct bch_io_opts *opts,
				struct bkey_s k,
				struct bch_extent_ptr *ptr)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	rcu_read_lock();
	if (!want_cached_ptr(c, opts, ptr)) {
		bch2_bkey_drop_ptr_noerror(k, ptr);
		goto out;
	}

	/*
	 * Stripes can't contain cached data, for - reasons.
	 *
	 * Possibly something we can fix in the future?
	 */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		if (&entry->ptr == ptr) {
			if (p.has_ec)
				bch2_bkey_drop_ptr_noerror(k, ptr);
			else
				ptr->cached = true;
			goto out;
		}

	BUG();
out:
	rcu_read_unlock();
}

/*
 * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
 *
 * Returns true if @k should be dropped entirely
 *
 * For existing keys, only called when btree nodes are being rewritten, not when
 * they're merely being compacted/resorted in memory.
 */
bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
	struct bch_dev *ca;

	rcu_read_lock();
	bch2_bkey_drop_ptrs(k, ptr,
		ptr->cached &&
		(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
		 dev_ptr_stale_rcu(ca, ptr) > 0));
	rcu_read_unlock();

	return bkey_deleted(k.k);
}

/*
 * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
 *
 * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
 * the promote target.
 */
bool bch2_extent_normalize_by_opts(struct bch_fs *c,
				   struct bch_io_opts *opts,
				   struct bkey_s k)
{
	struct bkey_ptrs ptrs;
	bool have_cached_ptr;

	rcu_read_lock();
restart_drop_ptrs:
	ptrs = bch2_bkey_ptrs(k);
	have_cached_ptr = false;

	bkey_for_each_ptr(ptrs, ptr)
		if (ptr->cached) {
			if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
				bch2_bkey_drop_ptr(k, ptr);
				goto restart_drop_ptrs;
			}
			have_cached_ptr = true;
		}
	rcu_read_unlock();

	return bkey_deleted(k.k);
}

void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
	out->atomic++;
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
	if (!ca) {
		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
			   (u64) ptr->offset, ptr->gen,
			   ptr->cached ? " cached" : "");
	} else {
		u32 offset;
		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);

		prt_printf(out, "ptr: %u:%llu:%u gen %u",
			   ptr->dev, b, offset, ptr->gen);
		if (ca->mi.durability != 1)
			prt_printf(out, " d=%u", ca->mi.durability);
		if (ptr->cached)
			prt_str(out, " cached");
		if (ptr->unwritten)
			prt_str(out, " unwritten");
		int stale = dev_ptr_stale_rcu(ca, ptr);
		if (stale > 0)
			prt_printf(out, " stale");
		else if (stale)
			prt_printf(out, " invalid");
	}
	rcu_read_unlock();
	--out->atomic;
}

void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
{
	prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
		   crc->compressed_size,
		   crc->uncompressed_size,
		   crc->offset, crc->nonce);
	bch2_prt_csum_type(out, crc->csum_type);
	prt_printf(out, " %0llx:%0llx ", crc->csum.hi, crc->csum.lo);
	prt_str(out, " compress ");
	bch2_prt_compression_type(out, crc->compression_type);
}

static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
					  const struct bch_extent_rebalance *r)
{
	prt_str(out, "rebalance:");

	prt_printf(out, " replicas=%u", r->data_replicas);
	if (r->data_replicas_from_inode)
		prt_str(out, " (inode)");

	prt_str(out, " checksum=");
	bch2_prt_csum_opt(out, r->data_checksum);
	if (r->data_checksum_from_inode)
		prt_str(out, " (inode)");

	if (r->background_compression || r->background_compression_from_inode) {
		prt_str(out, " background_compression=");
		bch2_compression_opt_to_text(out, r->background_compression);

		if (r->background_compression_from_inode)
			prt_str(out, " (inode)");
	}

	if (r->background_target || r->background_target_from_inode) {
		prt_str(out, " background_target=");
		if (c)
			bch2_target_to_text(out, c, r->background_target);
		else
			prt_printf(out, "%u", r->background_target);

		if (r->background_target_from_inode)
			prt_str(out, " (inode)");
	}

	if (r->promote_target || r->promote_target_from_inode) {
		prt_str(out, " promote_target=");
		if (c)
			bch2_target_to_text(out, c, r->promote_target);
		else
			prt_printf(out, "%u", r->promote_target);

		if (r->promote_target_from_inode)
			prt_str(out, " (inode)");
	}

	if (r->erasure_code || r->erasure_code_from_inode) {
		prt_printf(out, " ec=%u", r->erasure_code);
		if (r->erasure_code_from_inode)
			prt_str(out, " (inode)");
	}
}

void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	bool first = true;

	if (c)
		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));

	bkey_extent_entry_for_each(ptrs, entry) {
		if (!first)
			prt_printf(out, " ");

		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
			break;

		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128: {
			struct bch_extent_crc_unpacked crc =
				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bch2_extent_crc_unpacked_to_text(out, &crc);
			break;
		}
		case BCH_EXTENT_ENTRY_stripe_ptr: {
			const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;

			prt_printf(out, "ec: idx %llu block %u",
			       (u64) ec->idx, ec->block);
			break;
		}
		case BCH_EXTENT_ENTRY_rebalance:
			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
			break;

		case BCH_EXTENT_ENTRY_flags:
			prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
			break;

		default:
			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
			return;
		}

		first = false;
	}
}

static int extent_ptr_validate(struct bch_fs *c,
			       struct bkey_s_c k,
			       struct bkey_validate_context from,
			       const struct bch_extent_ptr *ptr,
			       unsigned size_ondisk,
			       bool metadata)
{
	int ret = 0;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr2)
		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
				 c, ptr_to_duplicate_device,
				 "multiple pointers to same device (%u)", ptr->dev);

	/* bad pointers are repaired by check_fix_ptrs(): */
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
	if (!ca) {
		rcu_read_unlock();
		return 0;
	}
	u32 bucket_offset;
	u64 bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
	unsigned first_bucket	= ca->mi.first_bucket;
	u64 nbuckets		= ca->mi.nbuckets;
	unsigned bucket_size	= ca->mi.bucket_size;
	rcu_read_unlock();

	bkey_fsck_err_on(bucket >= nbuckets,
			 c, ptr_after_last_bucket,
			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
	bkey_fsck_err_on(bucket < first_bucket,
			 c, ptr_before_first_bucket,
			 "pointer before first bucket (%llu < %u)", bucket, first_bucket);
	bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size,
			 c, ptr_spans_multiple_buckets,
			 "pointer spans multiple buckets (%u + %u > %u)",
		       bucket_offset, size_ondisk, bucket_size);
fsck_err:
	return ret;
}

int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
			    struct bkey_validate_context from)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct bch_extent_crc_unpacked crc;
	unsigned size_ondisk = k.k->size;
	unsigned nonce = UINT_MAX;
	unsigned nr_ptrs = 0;
	bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
	int ret = 0;

	if (bkey_is_btree_ptr(k.k))
		size_ondisk = btree_sectors(c);

	bkey_extent_entry_for_each(ptrs, entry) {
		bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
				 c, extent_ptrs_invalid_entry,
				 "invalid extent entry type (got %u, max %u)",
				 __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);

		bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
				 !extent_entry_is_ptr(entry),
				 c, btree_ptr_has_non_ptr,
				 "has non ptr field");

		switch (extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
			if (ret)
				return ret;

			bkey_fsck_err_on(entry->ptr.cached && have_ec,
					 c, ptr_cached_and_erasure_coded,
					 "cached, erasure coded ptr");

			if (!entry->ptr.unwritten)
				have_written = true;
			else
				have_unwritten = true;

			have_ec = false;
			crc_since_last_ptr = false;
			nr_ptrs++;
			break;
		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128:
			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
					 c, ptr_crc_csum_type_unknown,
					 "invalid checksum type");
			bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR,
					 c, ptr_crc_compression_type_unknown,
					 "invalid compression type");

			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_too_small,
					 "checksum offset + key size > uncompressed size");
			bkey_fsck_err_on(crc_is_encoded(crc) &&
					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
					 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
					 c, ptr_crc_uncompressed_size_too_big,
					 "too large encoded extent");
			bkey_fsck_err_on(!crc_is_compressed(crc) &&
					 crc.compressed_size != crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_mismatch,
					 "not compressed but compressed != uncompressed size");

			if (bch2_csum_type_is_encryption(crc.csum_type)) {
				if (nonce == UINT_MAX)
					nonce = crc.offset + crc.nonce;
				else if (nonce != crc.offset + crc.nonce)
					bkey_fsck_err(c, ptr_crc_nonce_mismatch,
						      "incorrect nonce");
			}

			bkey_fsck_err_on(crc_since_last_ptr,
					 c, ptr_crc_redundant,
					 "redundant crc entry");
			crc_since_last_ptr = true;

			size_ondisk = crc.compressed_size;
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			bkey_fsck_err_on(have_ec,
					 c, ptr_stripe_redundant,
					 "redundant stripe entry");
			have_ec = true;
			break;
		case BCH_EXTENT_ENTRY_rebalance: {
			/*
			 * this shouldn't be a fsck error, for forward
			 * compatibility; the rebalance code should just refetch
			 * the compression opt if it's unknown
			 */
#if 0
			const struct bch_extent_rebalance *r = &entry->rebalance;

			if (!bch2_compression_opt_valid(r->compression)) {
				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
				prt_printf(err, "invalid compression opt %u:%u",
					   opt.type, opt.level);
				return -BCH_ERR_invalid_bkey;
			}
#endif
			break;
		}
		case BCH_EXTENT_ENTRY_flags:
			bkey_fsck_err_on(entry != ptrs.start,
					 c, extent_flags_not_at_start,
					 "extent flags entry not at start");
			break;
		}
	}

	bkey_fsck_err_on(!nr_ptrs,
			 c, extent_ptrs_no_ptrs,
			 "no ptrs");
	bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX,
			 c, extent_ptrs_too_many_ptrs,
			 "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
	bkey_fsck_err_on(have_written && have_unwritten,
			 c, extent_ptrs_written_and_unwritten,
			 "extent with unwritten and written ptrs");
	bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten,
			 c, extent_ptrs_unwritten,
			 "has unwritten ptrs");
	bkey_fsck_err_on(crc_since_last_ptr,
			 c, extent_ptrs_redundant_crc,
			 "redundant crc entry");
	bkey_fsck_err_on(have_ec,
			 c, extent_ptrs_redundant_stripe,
			 "redundant stripe entry");
fsck_err:
	return ret;
}

void bch2_ptr_swab(struct bkey_s k)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
	union bch_extent_entry *entry;
	u64 *d;

	for (d =  (u64 *) ptrs.start;
	     d != (u64 *) ptrs.end;
	     d++)
		*d = swab64(*d);

	for (entry = ptrs.start;
	     entry < ptrs.end;
	     entry = extent_entry_next(entry)) {
		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			break;
		case BCH_EXTENT_ENTRY_crc32:
			entry->crc32.csum = swab32(entry->crc32.csum);
			break;
		case BCH_EXTENT_ENTRY_crc64:
			entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
			entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
			break;
		case BCH_EXTENT_ENTRY_crc128:
			entry->crc128.csum.hi = (__force __le64)
				swab64((__force u64) entry->crc128.csum.hi);
			entry->crc128.csum.lo = (__force __le64)
				swab64((__force u64) entry->crc128.csum.lo);
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			break;
		case BCH_EXTENT_ENTRY_rebalance:
			break;
		default:
			/* Bad entry type: will be caught by validate() */
			return;
		}
	}
}

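/*
 * Set the extent flags entry, inserting one at the start of the value if not
 * already present; requires the extent_flags incompatible feature:
 */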
int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
{
	int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
	if (ret)
		return ret;

	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));

	if (ptrs.start != ptrs.end &&
	    extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
		ptrs.start->flags.flags = flags;
	} else {
		struct bch_extent_flags f = {
			.type	= BIT(BCH_EXTENT_ENTRY_flags),
			.flags	= flags,
		};
		__extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
	}

	return 0;
}

/* Generic extent code: */

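/*
 * Trim the front of @k so that it starts at @where; returns the change in
 * the size of the value, in u64s (zero or negative):
 */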
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
{
	unsigned new_val_u64s = bkey_val_u64s(k.k);
	int val_u64s_delta;
	u64 sub;

	if (bkey_le(where, bkey_start_pos(k.k)))
		return 0;

	EBUG_ON(bkey_gt(where, k.k->p));

	sub = where.offset - bkey_start_offset(k.k);

	k.k->size -= sub;

	if (!k.k->size) {
		k.k->type = KEY_TYPE_deleted;
		new_val_u64s = 0;
	}

	switch (k.k->type) {
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v: {
		struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
		union bch_extent_entry *entry;
		bool seen_crc = false;

		bkey_extent_entry_for_each(ptrs, entry) {
			switch (extent_entry_type(entry)) {
			case BCH_EXTENT_ENTRY_ptr:
				if (!seen_crc)
					entry->ptr.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc32:
				entry->crc32.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc64:
				entry->crc64.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_crc128:
				entry->crc128.offset += sub;
				break;
			case BCH_EXTENT_ENTRY_stripe_ptr:
			case BCH_EXTENT_ENTRY_rebalance:
			case BCH_EXTENT_ENTRY_flags:
				break;
			}

			if (extent_entry_is_crc(entry))
				seen_crc = true;
		}

		break;
	}
	case KEY_TYPE_reflink_p: {
		struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);

		SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
		break;
	}
	case KEY_TYPE_inline_data:
	case KEY_TYPE_indirect_inline_data: {
		void *p = bkey_inline_data_p(k);
		unsigned bytes = bkey_inline_data_bytes(k.k);

		sub = min_t(u64, sub << 9, bytes);

		memmove(p, p + sub, bytes - sub);

		new_val_u64s -= sub >> 3;
		break;
	}
	}

	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
	BUG_ON(val_u64s_delta < 0);

	set_bkey_val_u64s(k.k, new_val_u64s);
	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
	return -val_u64s_delta;
}

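/*
 * Trim the back of @k so that it ends at @where; returns the change in the
 * size of the value, in u64s (zero or negative):
 */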
int bch2_cut_back_s(struct bpos where, struct bkey_s k)
{
	unsigned new_val_u64s = bkey_val_u64s(k.k);
	int val_u64s_delta;
	u64 len = 0;

	if (bkey_ge(where, k.k->p))
		return 0;

	EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));

	len = where.offset - bkey_start_offset(k.k);

	k.k->p.offset = where.offset;
	k.k->size = len;

	if (!len) {
		k.k->type = KEY_TYPE_deleted;
		new_val_u64s = 0;
	}

	switch (k.k->type) {
	case KEY_TYPE_inline_data:
	case KEY_TYPE_indirect_inline_data:
		new_val_u64s = (bkey_inline_data_offset(k.k) +
				min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3;
		break;
	}

	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
	BUG_ON(val_u64s_delta < 0);

	set_bkey_val_u64s(k.k, new_val_u64s);
	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
	return -val_u64s_delta;
}