// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "recovery_passes.h"
#include "snapshot.h"

#include <linux/random.h>

/*
 * Snapshot trees:
 *
 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
 * exist to provide a stable identifier for the whole lifetime of a snapshot
 * tree.
 */
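
/*
 * Minimal usage sketch (disabled; for illustration only): resolving a tree ID
 * to its root snapshot and master subvolume via bch2_snapshot_tree_lookup(),
 * defined below. Error handling is elided and the function name is
 * hypothetical.
 */
#if 0
static int example_print_tree_root(struct btree_trans *trans, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret)
		return ret;	/* -BCH_ERR_ENOENT_snapshot_tree if missing */

	pr_info("tree %u: root snapshot %u master subvol %u",
		tree_id,
		le32_to_cpu(s_t.root_snapshot),
		le32_to_cpu(s_t.master_subvol));
	return 0;
}
#endif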

void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
				struct bkey_s_c k)
{
	struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);

	prt_printf(out, "subvol %u root snapshot %u",
		   le32_to_cpu(t.v->master_subvol),
		   le32_to_cpu(t.v->root_snapshot));
}

int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}

int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	if (bch2_err_matches(ret, ENOENT))
		ret = -BCH_ERR_ENOENT_snapshot_tree;
	return ret;
}

struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}

static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree =
		__bch2_snapshot_tree_create(trans);

	if (IS_ERR(n_tree))
		return PTR_ERR(n_tree);

	n_tree->v.master_subvol	= cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot	= cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}

/* Snapshot nodes: */

static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
{
	while (id && id < ancestor) {
		const struct snapshot_t *s = __snapshot_t(t, id);
		id = s ? s->parent : 0;
	}
	return id == ancestor;
}

static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	rcu_read_lock();
	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
	rcu_read_unlock();

	return ret;
}

static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	if (s->skip[2] <= ancestor)
		return s->skip[2];
	if (s->skip[1] <= ancestor)
		return s->skip[1];
	if (s->skip[0] <= ancestor)
		return s->skip[0];
	return s->parent;
}

static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return false;

	return test_bit(ancestor - id - 1, s->is_ancestor);
}

bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}
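
/*
 * Illustration (disabled): how the fast path above is typically consumed.
 * Snapshot IDs are allocated in descending order, so an ancestor always has a
 * larger ID than its descendants; the skiplist walk gets within
 * IS_ANCESTOR_BITMAP of the ancestor, and the per-node is_ancestor bitmap
 * answers the final range. The IDs below are invented for the example.
 */
#if 0
static void example_ancestor_check(struct bch_fs *c)
{
	u32 leaf = 100, root = 4000;	/* hypothetical snapshot IDs */

	if (bch2_snapshot_is_ancestor(c, leaf, root))
		pr_info("%u is a descendant of %u", leaf, root);
}
#endif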

static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}

static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}
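
/*
 * Sketch (disabled) of the locking rules the two helpers above imply: writers
 * take c->snapshot_table_lock and may swap in a larger table via RCU;
 * lockless readers bracket snapshot_t() lookups with rcu_read_lock(), as
 * bch2_snapshot_tree_oldest_subvol() below does. Function names here are
 * hypothetical.
 */
#if 0
static u32 example_read_parent(struct bch_fs *c, u32 id)
{
	u32 parent;

	rcu_read_lock();
	parent = snapshot_t(c, id)->parent;	/* read side: RCU only */
	rcu_read_unlock();

	return parent;
}

static int example_mark_live(struct bch_fs *c, u32 id)
{
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);	/* write side: mutex held */
	struct snapshot_t *t = snapshot_t_mut(c, id);
	if (!t)
		ret = -ENOMEM;
	else
		t->live = true;
	mutex_unlock(&c->snapshot_table_lock);

	return ret;
}
#endif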

void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
	       BCH_SNAPSHOT_SUBVOL(s.v),
	       BCH_SNAPSHOT_DELETED(s.v),
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}

int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}

static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live		= true;
		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}

int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}

int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}

/* fsck: */

static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}

static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}

static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}

static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}

u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	rcu_read_lock();
	while (id && bch2_snapshot_exists(c, id)) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}

static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}

static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						le32_to_cpu(subvol.snapshot),
						root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}

/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

/*
 * Look up the snapshot tree for @tree_id, find its root, and make sure
 * @snap_id is a descendant:
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (bch2_err_matches(ret, ENOENT))
		return 0;
	if (ret)
		return ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
}

u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}

static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	unsigned i;

	for (i = 0; i < 3; i++)
		if (!s.parent) {
			if (s.skip[i])
				return false;
		} else {
			if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
				return false;
		}

	return true;
}
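
/*
 * Note: snapshot_skiplist_good() above encodes the skiplist invariant: a root
 * node (no parent) must have an all-zero skiplist, while every skip[] entry
 * of a non-root node must be one of its ancestors, so skiplist hops can never
 * leave the path from a node to its root.
 */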

/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =   PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}

static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
				BTREE_ID_snapshots, POS_MAX,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}

static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}

/* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id			btree;
	struct bpos			cur_pos;
	snapshot_id_list		cur_ids;
	DARRAY(snapshot_id_list)	trees;
};

static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}

static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	return r->btree == BTREE_ID_inodes
		? r->cur_pos.offset == pos.offset
		: r->cur_pos.inode == pos.inode;
}

static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
	darray_for_each(*l, i)
		if (snapshot_list_has_id(r, *i))
			return true;
	return false;
}

static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	bool first = true;
	darray_for_each(*s, i) {
		if (!first)
			prt_char(out, ' ');
		first = false;
		prt_printf(out, "%u", *i);
	}
}

static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
{
	if (r->cur_ids.nr) {
		darray_for_each(r->trees, i)
			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
				int ret = snapshot_list_merge(c, i, &r->cur_ids);
				if (ret)
					return ret;
				goto out;
			}
		darray_push(&r->trees, r->cur_ids);
		darray_init(&r->cur_ids);
	}
out:
	r->cur_ids.nr = 0;
	return 0;
}
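
/*
 * Worked illustration (disabled) of the grouping above: keys at the same file
 * position but different snapshot IDs prove those IDs belong to one tree, and
 * lists that share any ID are merged. Uses get_snapshot_trees(), defined just
 * below; positions and IDs are invented for the example.
 */
#if 0
static int example_reconstruct(struct bch_fs *c)
{
	struct snapshot_tree_reconstruct r = { .btree = BTREE_ID_extents };
	int ret;

	/* inode 10 seen in snapshots 4 and 7: {4, 7} form one tree */
	ret =   get_snapshot_trees(c, &r, SPOS(10, 0, 4)) ?:
		get_snapshot_trees(c, &r, SPOS(10, 0, 7)) ?:
	/* inode 11 seen in snapshots 7 and 9: merged into {4, 7, 9} */
		get_snapshot_trees(c, &r, SPOS(11, 0, 7)) ?:
		get_snapshot_trees(c, &r, SPOS(11, 0, 9)) ?:
		snapshot_tree_reconstruct_next(c, &r);

	snapshot_tree_reconstruct_exit(&r);
	return ret;
}
#endif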

static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
{
	if (!same_snapshot(r, pos))
		snapshot_tree_reconstruct_next(c, r);
	r->cur_pos = pos;
	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
}

int bch2_reconstruct_snapshots(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct snapshot_tree_reconstruct r = {};
	int ret = 0;

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		if (btree_type_has_snapshots(btree)) {
			r.btree = btree;

			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
					BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
				get_snapshot_trees(c, &r, k.k->p);
			}));
			if (ret)
				goto err;

			snapshot_tree_reconstruct_next(c, &r);
		}
	}

	darray_for_each(r.trees, t) {
		printbuf_reset(&buf);
		snapshot_id_list_to_text(&buf, t);

		darray_for_each(*t, id) {
			if (fsck_err_on(!bch2_snapshot_exists(c, *id),
					trans, snapshot_node_missing,
					"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
				if (t->nr > 1) {
					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
					ret = -BCH_ERR_fsck_repair_unimplemented;
					goto err;
				}

				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
						check_snapshot_exists(trans, *id));
				if (ret)
					goto err;
			}
		}
	}
fsck_err:
err:
	bch2_trans_put(trans);
	snapshot_tree_reconstruct_exit(&r);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		ret = bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/*
 * Mark a snapshot as deleted, for future cleanup:
 */
int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *s =
		bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_snapshots, POS(0, id),
				    0, snapshot);
	int ret = PTR_ERR_OR_ZERO(s);
	if (unlikely(ret)) {
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
					trans->c, "missing snapshot %u", id);
		return ret;
	}

	/* already deleted? */
	if (BCH_SNAPSHOT_DELETED(&s->v))
		goto err;

	SET_BCH_SNAPSHOT_DELETED(&s->v, true);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
	s->v.subvol = 0;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
{
	if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
		swap(s->children[0], s->children[1]);
}

static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, p_iter = {};
	struct btree_iter c_iter = {};
	struct btree_iter tree_iter = {};
	struct bkey_s_c_snapshot s;
	u32 parent_id, child_id;
	unsigned i;
	int ret = 0;

	s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
				     BTREE_ITER_intent, snapshot);
	ret = bkey_err(s);
	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
				"missing snapshot %u", id);

	if (ret)
		goto err;

	BUG_ON(s.v->children[1]);

	parent_id = le32_to_cpu(s.v->parent);
	child_id = le32_to_cpu(s.v->children[0]);

	if (parent_id) {
		struct bkey_i_snapshot *parent;

		parent = bch2_bkey_get_mut_typed(trans, &p_iter,
				     BTREE_ID_snapshots, POS(0, parent_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(parent);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", parent_id);
		if (unlikely(ret))
			goto err;

		/* find entry in parent->children for node being deleted */
		for (i = 0; i < 2; i++)
			if (le32_to_cpu(parent->v.children[i]) == id)
				break;

		if (bch2_fs_inconsistent_on(i == 2, c,
					"snapshot %u missing child pointer to %u",
					parent_id, id))
			goto err;

		parent->v.children[i] = cpu_to_le32(child_id);

		normalize_snapshot_child_pointers(&parent->v);
	}

	if (child_id) {
		struct bkey_i_snapshot *child;

		child = bch2_bkey_get_mut_typed(trans, &c_iter,
				     BTREE_ID_snapshots, POS(0, child_id),
				     0, snapshot);
		ret = PTR_ERR_OR_ZERO(child);
		bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
					"missing snapshot %u", child_id);
		if (unlikely(ret))
			goto err;

		child->v.parent = cpu_to_le32(parent_id);

		if (!child->v.parent) {
			child->v.skip[0] = 0;
			child->v.skip[1] = 0;
			child->v.skip[2] = 0;
		}
	}

	if (!parent_id) {
		/*
		 * We're deleting the root of a snapshot tree: update the
		 * snapshot_tree entry to point to the new root, or delete it if
		 * this is the last snapshot ID in this tree:
		 */
		struct bkey_i_snapshot_tree *s_t;

		BUG_ON(s.v->children[1]);

		s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
				BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
				0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(s_t);
		if (ret)
			goto err;

		if (s.v->children[0]) {
			s_t->v.root_snapshot = s.v->children[0];
		} else {
			s_t->k.type = KEY_TYPE_deleted;
			set_bkey_val_u64s(&s_t->k, 0);
		}
	}

	ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &tree_iter);
	bch2_trans_iter_exit(trans, &p_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_intent);
	k = bch2_btree_iter_peek(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		k = bch2_btree_iter_prev_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);
		n->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
		n->v.btime.hi	= 0;

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create new snapshot IDs as children of an existing snapshot ID:
 */
static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct btree_iter iter;
	struct bkey_i_snapshot *n_parent;
	int ret = 0;

	n_parent = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, parent),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(n_parent);
	if (unlikely(ret)) {
		if (bch2_err_matches(ret, ENOENT))
			bch_err(trans->c, "snapshot %u not found", parent);
		return ret;
	}

	if (n_parent->v.children[0] || n_parent->v.children[1]) {
		bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
		ret = -EINVAL;
		goto err;
	}

	ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		goto err;

	n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
	n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
	n_parent->v.subvol = 0;
	SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/*
 * Create a snapshot node that is the root of a new tree:
 */
static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	struct bkey_i_snapshot_tree *n_tree;
	int ret;

	n_tree = __bch2_snapshot_tree_create(trans);
	ret =   PTR_ERR_OR_ZERO(n_tree) ?:
		create_snapids(trans, 0, n_tree->k.p.offset,
			     new_snapids, snapshot_subvols, nr_snapids);
	if (ret)
		return ret;

	n_tree->v.master_subvol	= cpu_to_le32(snapshot_subvols[0]);
	n_tree->v.root_snapshot	= cpu_to_le32(new_snapids[0]);
	return 0;
}

int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
			      u32 *new_snapids,
			      u32 *snapshot_subvols,
			      unsigned nr_snapids)
{
	BUG_ON((parent == 0) != (nr_snapids == 1));
	BUG_ON((parent != 0) != (nr_snapids == 2));

	return parent
		? bch2_snapshot_node_create_children(trans, parent,
				new_snapids, snapshot_subvols, nr_snapids)
		: bch2_snapshot_node_create_tree(trans,
				new_snapids, snapshot_subvols, nr_snapids);
}
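
/*
 * Usage sketch (disabled): the two calling conventions enforced by the
 * BUG_ON()s above. The function name and subvolume IDs are hypothetical.
 */
#if 0
static int example_create_nodes(struct btree_trans *trans)
{
	u32 new_snapids[2], snapshot_subvols[2] = { 1, 2 };

	/* parent == 0: create the root of a brand new snapshot tree: */
	int ret = bch2_snapshot_node_create(trans, 0,
				new_snapids, snapshot_subvols, 1);
	if (ret)
		return ret;

	/* parent != 0: split an existing node into two children: */
	return bch2_snapshot_node_create(trans, new_snapids[0],
				new_snapids, snapshot_subvols, 2);
}
#endif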

/*
 * If we have an unlinked inode in an internal snapshot node, and the inode
 * really has been deleted in all child snapshots, how does this get cleaned up?
 *
 * first there is the problem of how keys that have been overwritten in all
 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
 * special?
 *
 * also: unlinked inode in internal snapshot appears to not be getting deleted
 * correctly if inode doesn't exist in leaf snapshots
 *
 * solution:
 *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */
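
/*
 * Hedged sketch (disabled) of the solution described above: copy a key from
 * an interior snapshot node down to one descendant leaf, mirroring the
 * mechanics of delete_dead_snapshots_process_key() below. The function name
 * is hypothetical, and leaf_id is assumed to have been found by walking
 * snapshot_t(c, id)->children.
 */
#if 0
static int example_copy_key_to_leaf(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k, u32 leaf_id)
{
	struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
	int ret = PTR_ERR_OR_ZERO(new);
	if (ret)
		return ret;

	new->k.p.snapshot = leaf_id;

	struct btree_iter dst_iter;
	struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
						   iter->btree_id, new->k.p,
						   BTREE_ITER_all_snapshots|
						   BTREE_ITER_intent);
	ret =   bkey_err(dst_k) ?:
		bch2_trans_update(trans, &dst_iter, new,
				  BTREE_UPDATE_internal_snapshot_node);
	bch2_trans_iter_exit(trans, &dst_iter);
	return ret;
}
#endif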

struct snapshot_interior_delete {
	u32	id;
	u32	live_child;
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;

static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
	darray_for_each(*l, i)
		if (i->id == id)
			return i->live_child;
	return 0;
}

static unsigned __live_child(struct snapshot_table *t, u32 id,
			     snapshot_id_list *delete_leaves,
			     interior_delete_list *delete_interior)
{
	struct snapshot_t *s = __snapshot_t(t, id);
	if (!s)
		return 0;

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
		if (s->children[i] &&
		    !snapshot_list_has_id(delete_leaves, s->children[i]) &&
		    !interior_delete_has_id(delete_interior, s->children[i]))
			return s->children[i];

	for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
		u32 live_child = s->children[i]
			? __live_child(t, s->children[i], delete_leaves, delete_interior)
			: 0;
		if (live_child)
			return live_child;
	}

	return 0;
}

static unsigned live_child(struct bch_fs *c, u32 id,
			   snapshot_id_list *delete_leaves,
			   interior_delete_list *delete_interior)
{
	rcu_read_lock();
	u32 ret = __live_child(rcu_dereference(c->snapshots), id,
			       delete_leaves, delete_interior);
	rcu_read_unlock();
	return ret;
}

static int delete_dead_snapshots_process_key(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_s_c k,
					     snapshot_id_list *delete_leaves,
					     interior_delete_list *delete_interior)
{
	if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
		return bch2_btree_delete_at(trans, iter,
					    BTREE_UPDATE_internal_snapshot_node);

	u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
	if (live_child) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		int ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		new->k.p.snapshot = live_child;

		struct btree_iter dst_iter;
		struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
							   iter->btree_id, new->k.p,
							   BTREE_ITER_all_snapshots|
							   BTREE_ITER_intent);
		ret = bkey_err(dst_k);
		if (ret)
			return ret;

		ret =   (bkey_deleted(dst_k.k)
			 ? bch2_trans_update(trans, &dst_iter, new,
					     BTREE_UPDATE_internal_snapshot_node)
			 : 0) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_internal_snapshot_node);
		bch2_trans_iter_exit(trans, &dst_iter);
		return ret;
	}

	return 0;
}

/*
 * For a given snapshot, if it doesn't have a subvolume that points to it, and
 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
 * as deleted.
 */
static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
					snapshot_id_list *delete_leaves,
					interior_delete_list *delete_interior)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
	unsigned live_children = 0;

	if (BCH_SNAPSHOT_SUBVOL(s.v))
		return 0;

	for (unsigned i = 0; i < 2; i++) {
		u32 child = le32_to_cpu(s.v->children[i]);

		live_children += child &&
			!snapshot_list_has_id(delete_leaves, child);
	}

	if (live_children == 0) {
		return snapshot_list_add(c, delete_leaves, s.k->p.offset);
	} else if (live_children == 1) {
		struct snapshot_interior_delete d = {
			.id		= s.k->p.offset,
			.live_child	= live_child(c, s.k->p.offset, delete_leaves, delete_interior),
		};

		if (!d.live_child) {
			bch_err(c, "error finding live child of snapshot %u", d.id);
			return -EINVAL;
		}

		return darray_push(delete_interior, d);
	} else {
		return 0;
	}
}

static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
						interior_delete_list *skip)
{
	rcu_read_lock();
	while (interior_delete_has_id(skip, id))
		id = __bch2_snapshot_parent(c, id);

	while (n--) {
		do {
			id = __bch2_snapshot_parent(c, id);
		} while (interior_delete_has_id(skip, id));
	}
	rcu_read_unlock();

	return id;
}

static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      interior_delete_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	if (interior_delete_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			if (interior_delete_has_id(deleted, id)) {
				id = bch2_snapshot_nth_parent_skip(c,
							parent,
							depth > 1
							? get_random_u32_below(depth - 1)
							: 0,
							deleted);
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}

int bch2_delete_dead_snapshots(struct bch_fs *c)
{
	if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
		return 0;

	struct btree_trans *trans = bch2_trans_get(c);
	snapshot_id_list delete_leaves = {};
	interior_delete_list delete_interior = {};
	int ret = 0;

	/*
	 * For every snapshot node: If we have no live children and it's not
	 * pointed to by a subvolume, delete it:
	 */
	ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
		check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "walking snapshots");
	if (ret)
		goto err;

	if (!delete_leaves.nr && !delete_interior.nr)
		goto err;

	{
		struct printbuf buf = PRINTBUF;
		prt_printf(&buf, "deleting leaves");
		darray_for_each(delete_leaves, i)
			prt_printf(&buf, " %u", *i);

		prt_printf(&buf, " interior");
		darray_for_each(delete_interior, i)
			prt_printf(&buf, " %u->%u", i->id, i->live_child);

		ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
		printbuf_exit(&buf);
		if (ret)
			goto err;
	}

	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
		struct disk_reservation res = { 0 };

		if (!btree_type_has_snapshots(btree))
			continue;

		ret = for_each_btree_key_commit(trans, iter,
				btree, POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				&res, NULL, BCH_TRANS_COMMIT_no_enospc,
			delete_dead_snapshots_process_key(trans, &iter, k,
							  &delete_leaves,
							  &delete_interior));

		bch2_disk_reservation_put(c, &res);

		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting keys from dying snapshots");
		if (ret)
			goto err;
	}

	darray_for_each(delete_leaves, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, *i));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", *i);
		if (ret)
			goto err;
	}

	/*
	 * Fixing children of deleted snapshots can't be done completely
	 * atomically, if we crash between here and when we delete the interior
	 * nodes some depth fields will be off:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
				  BTREE_ITER_intent, k,
				  NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
	if (ret)
		goto err;

	darray_for_each(delete_interior, i) {
		ret = commit_do(trans, NULL, NULL, 0,
			bch2_snapshot_node_delete(trans, i->id));
		if (!bch2_err_matches(ret, EROFS))
			bch_err_msg(c, ret, "deleting snapshot %u", i->id);
		if (ret)
			goto err;
	}
err:
	darray_exit(&delete_interior);
	darray_exit(&delete_leaves);
	bch2_trans_put(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);

	set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);

	bch2_delete_dead_snapshots(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
		return;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}

int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
				       enum btree_id id,
				       struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
					     BTREE_ITER_not_extents|
					     BTREE_ITER_all_snapshots,
					     k, ret) {
		if (!bkey_eq(pos, k.k->p))
			break;

		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
			ret = 1;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
{
	/* If there's one child, it's redundant and keys will be moved to the child */
	return !!snap.v->children[0] + !!snap.v->children[1] == 1;
}

static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
	if (BCH_SNAPSHOT_DELETED(snap.v) ||
	    interior_snapshot_needs_delete(snap))
		set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);

	return 0;
}

int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Initializing the is_ancestor bitmaps requires ancestors to already be
	 * initialized - so mark in reverse:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
				   POS_MAX, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_check_snapshot_needs_deletion(trans, k)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}

void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kvfree(rcu_dereference_protected(c->snapshots, true));
}