1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_key_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "errcode.h"
10 #include "error.h"
11 #include "fs.h"
12 #include "recovery_passes.h"
13 #include "snapshot.h"
14
15 #include <linux/random.h>
16
17 /*
18 * Snapshot trees:
19 *
20 * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
21 * exist to provide a stable identifier for the whole lifetime of a snapshot
22 * tree.
23 */
24
bch2_snapshot_tree_to_text(struct printbuf * out,struct bch_fs * c,struct bkey_s_c k)25 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
26 struct bkey_s_c k)
27 {
28 struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
29
30 prt_printf(out, "subvol %u root snapshot %u",
31 le32_to_cpu(t.v->master_subvol),
32 le32_to_cpu(t.v->root_snapshot));
33 }
34
/*
 * Validate a snapshot_tree key: position must be inode 0, with a nonzero
 * 32-bit offset (tree IDs are u32, and 0 is not a valid ID).
 */
int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k,
				struct bkey_validate_context from)
{
	int ret = 0;

	/* bkey_fsck_err_on() sets ret and jumps to fsck_err on failure: */
	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_tree_pos_bad,
			 "bad pos");
fsck_err:
	return ret;
}
47
/*
 * Read the snapshot_tree entry @id into @s, translating a generic ENOENT
 * into the snapshot-tree-specific error code.
 */
int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
			      struct bch_snapshot_tree *s)
{
	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
					  BTREE_ITER_with_updates, snapshot_tree, s);

	return bch2_err_matches(ret, ENOENT)
		? -BCH_ERR_ENOENT_snapshot_tree
		: ret;
}
58
/*
 * Allocate a new snapshot_tree key in the first empty slot in
 * BTREE_ID_snapshot_trees; the value is left zeroed for the caller to
 * fill in.  Returns the new key, or an ERR_PTR on failure.
 */
struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	/* Translate the generic "btree full" error into a snapshot tree one: */
	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}
77
/*
 * Create a new snapshot_tree entry pointing at @root_id/@subvol_id,
 * returning the newly allocated tree ID via @tree_id.
 */
static int bch2_snapshot_tree_create(struct btree_trans *trans,
				u32 root_id, u32 subvol_id, u32 *tree_id)
{
	struct bkey_i_snapshot_tree *n_tree = __bch2_snapshot_tree_create(trans);
	int ret = PTR_ERR_OR_ZERO(n_tree);

	if (ret)
		return ret;

	n_tree->v.master_subvol = cpu_to_le32(subvol_id);
	n_tree->v.root_snapshot = cpu_to_le32(root_id);
	*tree_id = n_tree->k.p.offset;
	return 0;
}
92
93 /* Snapshot nodes: */
94
__bch2_snapshot_is_ancestor_early(struct snapshot_table * t,u32 id,u32 ancestor)95 static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
96 {
97 while (id && id < ancestor) {
98 const struct snapshot_t *s = __snapshot_t(t, id);
99 id = s ? s->parent : 0;
100 }
101 return id == ancestor;
102 }
103
bch2_snapshot_is_ancestor_early(struct bch_fs * c,u32 id,u32 ancestor)104 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
105 {
106 rcu_read_lock();
107 bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
108 rcu_read_unlock();
109
110 return ret;
111 }
112
/*
 * Jump as far up the ancestry chain as possible without passing @ancestor,
 * using the node's skiplist entries (checked furthest-first); fall back to
 * the immediate parent.
 */
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
{
	const struct snapshot_t *s = __snapshot_t(t, id);

	if (!s)
		return 0;

	for (int i = 2; i >= 0; i--)
		if (s->skip[i] <= ancestor)
			return s->skip[i];

	return s->parent;
}
127
test_ancestor_bitmap(struct snapshot_table * t,u32 id,u32 ancestor)128 static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
129 {
130 const struct snapshot_t *s = __snapshot_t(t, id);
131 if (!s)
132 return false;
133
134 return test_bit(ancestor - id - 1, s->is_ancestor);
135 }
136
/*
 * Returns true if @ancestor is @id or an ancestor of @id.
 *
 * Fast path: use the skiplist to jump to within IS_ANCESTOR_BITMAP
 * generations of @ancestor, then consult the per-node ancestor bitmap.
 * Before check_snapshots has run the skiplist/bitmap data may be invalid,
 * so fall back to walking parent pointers.
 */
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	/* Skiplist-hop until @id is within bitmap range of @ancestor: */
	if (likely(ancestor >= IS_ANCESTOR_BITMAP))
		while (id && id < ancestor - IS_ANCESTOR_BITMAP)
			id = get_ancestor_below(t, id, ancestor);

	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	/* Debug builds: cross-check against the slow parent walk */
	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}
163
/*
 * Grow the in-memory snapshot table so it has an entry for @id.
 *
 * Snapshot IDs are allocated from U32_MAX downwards, so the table is
 * indexed by U32_MAX - id.  Called with snapshot_table_lock held (sole
 * writer); readers access the table under RCU, hence the
 * publish-then-free-after-grace-period dance below.
 */
static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *new, *old;

	/* Round up to the allocator's real bucket size so growth amortizes: */
	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);

	if (unlikely(new_bytes > INT_MAX))
		return NULL;

	new = kvzalloc(new_bytes, GFP_KERNEL);
	if (!new)
		return NULL;

	new->nr = new_size;

	old = rcu_dereference_protected(c->snapshots, true);
	if (old)
		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);

	/* Publish the new table; old one is freed after an RCU grace period: */
	rcu_assign_pointer(c->snapshots, new);
	kvfree_rcu(old, rcu);

	return &rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
}
191
/*
 * Return a mutable pointer to the table entry for snapshot @id, growing
 * the table if @id is out of range.  Caller must hold snapshot_table_lock.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(table && idx < table->nr))
		return &table->s[idx];

	/* Slow path: reallocate the table to cover @id: */
	return __snapshot_t_mut(c, id);
}
206
/* Print a snapshot key: flags, tree links, and optional depth/skiplist. */
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
		   BCH_SNAPSHOT_SUBVOL(s.v),
		   BCH_SNAPSHOT_DELETED(s.v),
		   le32_to_cpu(s.v->parent),
		   le32_to_cpu(s.v->children[0]),
		   le32_to_cpu(s.v->children[1]),
		   le32_to_cpu(s.v->subvol),
		   le32_to_cpu(s.v->tree));

	/* depth/skip[] only exist in newer, larger versions of the key: */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}
228
/*
 * Validate a snapshot key: position range, parent/child ID ordering
 * (snapshot IDs are allocated from U32_MAX downwards, so parents always
 * have higher IDs than children), and the optional depth/skiplist fields.
 */
int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k,
			   struct bkey_validate_context from)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;
	int ret = 0;

	/* Snapshot IDs are 32 bit; 0 is not a valid ID: */
	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)),
			 c, snapshot_pos_bad,
			 "bad pos");

	s = bkey_s_c_to_snapshot(k);

	/* Parents are older, hence higher-numbered: */
	id = le32_to_cpu(s.v->parent);
	bkey_fsck_err_on(id && id <= k.k->p.offset,
			 c, snapshot_parent_bad,
			 "bad parent node (%u <= %llu)",
			 id, k.k->p.offset);

	/* children[] must be sorted descending (see normalize_snapshot_child_pointers): */
	bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]),
			 c, snapshot_children_not_normalized,
			 "children not normalized");

	bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1],
			 c, snapshot_child_duplicate,
			 "duplicate child nodes");

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		/* Children are newer, hence lower-numbered, than their parent: */
		bkey_fsck_err_on(id >= k.k->p.offset,
				 c, snapshot_child_bad,
				 "bad child node (%u >= %llu)",
				 id, k.k->p.offset);
	}

	/* depth/skip[] only exist in newer, larger versions of the key: */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
				 le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]),
				 c, snapshot_skiplist_not_normalized,
				 "skiplist not normalized");

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			/* Skiplist entries must point at ancestors (higher IDs): */
			bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent),
					 c, snapshot_skiplist_bad,
					 "bad skiplist node %u", id);
		}
	}
fsck_err:
	return ret;
}
283
/*
 * Btree trigger for the snapshots btree: mirror the new key's state into
 * the in-memory snapshot table so ancestry lookups don't have to hit the
 * btree.
 */
static int __bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->live = true;
		t->parent = le32_to_cpu(s.v->parent);
		t->children[0] = le32_to_cpu(s.v->children[0]);
		t->children[1] = le32_to_cpu(s.v->children[1]);
		t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree = le32_to_cpu(s.v->tree);

		/* depth/skip[] only exist in newer, larger versions of the key: */
		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth = le32_to_cpu(s.v->depth);
			t->skip[0] = le32_to_cpu(s.v->skip[0]);
			t->skip[1] = le32_to_cpu(s.v->skip[1]);
			t->skip[2] = le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth = 0;
			t->skip[0] = 0;
			t->skip[1] = 0;
			t->skip[2] = 0;
		}

		/* Record all ancestors within bitmap range in is_ancestor: */
		u32 parent = id;

		while ((parent = bch2_snapshot_parent_early(c, parent)) &&
		       parent - id - 1 < IS_ANCESTOR_BITMAP)
			__set_bit(parent - id - 1, t->is_ancestor);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
			/* Kick off async cleanup if that recovery pass already ran: */
			if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
				bch2_delete_dead_snapshots_async(c);
		}
	} else {
		/* Key deleted or overwritten with another type: clear the entry */
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
342
/*
 * Public trigger entry point: the trigger interface passes the new key as
 * a mutable bkey_s, but we only read it, so hand it down as const.
 */
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s new,
		       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
}
350
/* Read snapshot key @id from the snapshots btree into @s. */
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_with_updates, snapshot, s);
}
357
358 /* fsck: */
359
/* Return child @child (0 or 1) of snapshot @id from the in-memory table. */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	const struct snapshot_t *s = snapshot_t(c, id);

	return s->children[child];
}
364
/* Child slot 0 is the left child. */
static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 0);
}
369
/* Child slot 1 is the right child. */
static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
{
	return bch2_snapshot_child(c, id, 1);
}
374
/*
 * Pre-order successor of @id within its snapshot tree: descend to the
 * left child if there is one, otherwise climb until we can step across to
 * a right sibling.  Returns 0 once the whole tree has been visited.
 */
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 next = bch2_snapshot_left_child(c, id);

	if (next)
		return next;

	for (u32 parent; (parent = bch2_snapshot_parent(c, id)); id = parent) {
		next = bch2_snapshot_right_child(c, parent);
		if (next && next != id)
			return next;
	}

	return 0;
}
392
/*
 * Walk every node in the snapshot tree rooted at @snapshot_root and
 * return the lowest-numbered subvolume ID found, or 0 if there is none.
 * (Lower subvolume IDs were allocated earlier, hence "oldest".)
 */
u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 id = snapshot_root;
	u32 subvol = 0, s;

	rcu_read_lock();
	while (id && bch2_snapshot_exists(c, id)) {
		s = snapshot_t(c, id)->subvol;

		if (s && (!subvol || s < subvol))
			subvol = s;

		/* Pre-order traversal of the snapshot tree: */
		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}
411
/*
 * Find the master (non-snapshot) subvolume pointing into the snapshot
 * tree rooted at @snapshot_root.  If none exists, promote the tree's
 * oldest subvolume by clearing its SNAP flag, and return that.
 */
static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		/* Only consider subvolumes inside this snapshot tree: */
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		/* No master subvol: promote the oldest one in the tree */
		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
454
/*
 * fsck: check one snapshot_tree key.  Deletes it if it doesn't point at
 * the root of a valid snapshot tree; repairs the master_subvol pointer if
 * it points at a missing, wrong, or snapshot subvolume.
 */
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	struct btree_iter snapshot_iter = {};
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	struct bkey_s_c_snapshot snapshot_k =
		bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots,
					 POS(0, root_id), 0, snapshot);
	ret = bkey_err(snapshot_k);
	/* ENOENT is handled below as "missing snapshot"; other errors abort: */
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	if (!ret)
		bkey_val_copy(&s, snapshot_k);

	/*
	 * The tree must point at an existing snapshot which is the root of
	 * its tree and which points back at this tree; otherwise delete it:
	 */
	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			trans, snapshot_tree_to_missing_snapshot,
			"snapshot tree points to missing/incorrect snapshot:\n%s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c),
			 prt_newline(&buf),
			 ret
			 ? prt_printf(&buf, "(%s)", bch2_err_str(ret))
			 : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c),
			 buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	if (!st.v->master_subvol)
		goto out;

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	/* Any of these three problems mean the master_subvol must be repaired: */
	if (fsck_err_on(ret,
			trans, snapshot_tree_to_missing_subvol,
			"snapshot tree points to missing subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
						   le32_to_cpu(subvol.snapshot),
						   root_id),
			trans, snapshot_tree_to_wrong_subvol,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
			trans, snapshot_tree_to_snapshot_subvol,
			"snapshot tree points to snapshot subvolume:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		bch_err_fn(c, ret);

		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
			ret = 0;
			goto err;
		}

		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
out:
err:
fsck_err:
	bch2_trans_iter_exit(trans, &snapshot_iter);
	printbuf_exit(&buf);
	return ret;
}
552
553 /*
554 * For each snapshot_tree, make sure it points to the root of a snapshot tree
555 * and that snapshot entry points back to it, or delete it.
556 *
557 * And, make sure it points to a subvolume within that snapshot tree, or correct
558 * it to point to the oldest subvolume within that snapshot tree.
559 */
/* fsck entry point: run check_snapshot_tree() on every snapshot_tree key. */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	/* Commit after each key so repairs are applied incrementally: */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
571
572 /*
573 * Look up snapshot tree for @tree_id and find root,
574 * make sure @snap_id is a descendent:
575 */
/*
 * Look up snapshot tree @tree_id and verify @snap_id descends from its
 * root.  Returns 1 if good, 0 if missing/incorrect, negative on error.
 */
static int snapshot_tree_ptr_good(struct btree_trans *trans,
				  u32 snap_id, u32 tree_id)
{
	struct bch_snapshot_tree s_t;
	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);

	if (ret)
		return bch2_err_matches(ret, ENOENT) ? 0 : ret;

	return bch2_snapshot_is_ancestor_early(trans->c, snap_id,
					       le32_to_cpu(s_t.root_snapshot));
}
589
/*
 * Pick a skiplist entry for a node whose parent is @id: a randomly chosen
 * ancestor of @id (the nth parent for random n in [0, depth)), or @id
 * itself if it has no parent.  Returns 0 if @id is 0.
 */
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}
605
/*
 * Verify @id's skiplist: a root node (no parent) must have all-zero skip
 * entries; otherwise every entry must point at an ancestor of @id.
 */
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
{
	bool is_root = !s.parent;

	for (unsigned i = 0; i < 3; i++) {
		if (is_root) {
			if (s.skip[i])
				return false;
		} else if (!bch2_snapshot_is_ancestor_early(trans->c, id,
							    le32_to_cpu(s.skip[i]))) {
			return false;
		}
	}

	return true;
}
621
622 /*
623 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
624 * its snapshot_tree pointer is correct (allocate new one if necessary), then
625 * update this node's pointer to root node's pointer:
626 */
/*
 * Repair a bad snapshot->tree pointer: find this node's root, make sure
 * the root's own tree pointer is valid (creating a new snapshot_tree
 * entry if necessary), then copy the root's tree ID into this node.
 * On success *s is updated to reflect the repaired key.
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_with_updates, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	/* Root's tree pointer is also bad: create a fresh snapshot_tree entry */
	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	/* Propagate the (now valid) tree ID to the node we were called for: */
	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}
680
/*
 * fsck: check one snapshot key.  Verifies parent/child back-pointers,
 * the subvolume link, the snapshot_tree pointer, the depth field and the
 * skiplist, repairing what it can.
 */
static int check_snapshot(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct bch_snapshot v;
	struct bkey_i_snapshot *u;
	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
	u32 real_depth;
	struct printbuf buf = PRINTBUF;
	u32 i, id;
	int ret = 0;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* Copy the value, zero-padding: older keys lack the newer fields */
	memset(&s, 0, sizeof(s));
	memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));

	/* Parent must exist and point back at us through children[]: */
	id = le32_to_cpu(s.parent);
	if (id) {
		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot with nonexistent parent:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
		    le32_to_cpu(v.children[1]) != k.k->p.offset) {
			bch_err(c, "snapshot parent %u missing pointer to child %llu",
				id, k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	/* Each child must exist and point back at us as parent: */
	for (i = 0; i < 2 && s.children[i]; i++) {
		id = le32_to_cpu(s.children[i]);

		ret = bch2_snapshot_lookup(trans, id, &v);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot node %llu has nonexistent child %u",
				k.k->p.offset, id);
		if (ret)
			goto err;

		if (le32_to_cpu(v.parent) != k.k->p.offset) {
			bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
				id, le32_to_cpu(v.parent), k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	}

	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
		!BCH_SNAPSHOT_DELETED(&s);

	if (should_have_subvol) {
		/* Subvolume must exist and point back at this snapshot: */
		id = le32_to_cpu(s.subvol);
		ret = bch2_subvolume_get(trans, id, false, &subvol);
		if (bch2_err_matches(ret, ENOENT))
			bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (ret)
			goto err;

		if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
			bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
				k.k->p.offset);
			ret = -EINVAL;
			goto err;
		}
	} else {
		/* No subvol expected: clear a stale subvol pointer */
		if (fsck_err_on(s.subvol,
				trans, snapshot_should_not_have_subvol,
				"snapshot should not point to subvol:\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			u->v.subvol = 0;
			s = u->v;
		}
	}

	/* snapshot_tree pointer: returns 0 if bad, 1 if good, <0 on error */
	ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_to_bad_snapshot_tree,
			"snapshot points to missing/incorrect tree:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
		if (ret)
			goto err;
	}
	ret = 0;

	/*
	 * Depth is parent's depth + 1; valid because we iterate in reverse,
	 * so parents (higher IDs) have already been checked:
	 */
	real_depth = bch2_snapshot_depth(c, parent_id);

	if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
			trans, snapshot_bad_depth,
			"snapshot with incorrect depth field, should be %u:\n%s",
			real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.depth = cpu_to_le32(real_depth);
		s = u->v;
	}

	ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
	if (ret < 0)
		goto err;

	if (fsck_err_on(!ret,
			trans, snapshot_bad_skiplist,
			"snapshot with bad skiplist field:\n%s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		/* Regenerate the skiplist from random ancestors, then sort: */
		for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
			u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));

		bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
		s = u->v;
	}
	ret = 0;
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
825
/* fsck entry point: run check_snapshot() on every key in the snapshots btree. */
int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
841
/*
 * Recreate a missing snapshot node @id, reconstructing its snapshot_tree
 * entry and subvolume back-pointer from whatever still references it.
 */
static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
	struct bch_fs *c = trans->c;

	/* Do we need to reconstruct the snapshot_tree entry as well? */
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
	u32 tree_id = 0;

	/* Look for an existing snapshot_tree rooted at @id: */
	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
			tree_id = k.k->p.offset;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	if (!tree_id) {
		ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
		if (ret)
			return ret;
	}

	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
	ret = PTR_ERR_OR_ZERO(snapshot);
	if (ret)
		return ret;

	bkey_snapshot_init(&snapshot->k_i);
	snapshot->k.p		= POS(0, id);
	snapshot->v.tree	= cpu_to_le32(tree_id);
	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));

	/* If a subvolume still points at @id, restore the back-pointer: */
	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
			snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
			SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	/* Insert the key, and update the in-memory table to match: */
	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
}
894
895 /* Figure out which snapshot nodes belong in the same tree: */
struct snapshot_tree_reconstruct {
	enum btree_id btree;			/* btree currently being scanned */
	struct bpos cur_pos;			/* position of the last key seen */
	snapshot_id_list cur_ids;		/* snapshot IDs seen at cur_pos */
	DARRAY(snapshot_id_list) trees;		/* one ID list per reconstructed tree */
};
902
/* Free all reconstruction state: inner lists first, then their container. */
static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}
910
same_snapshot(struct snapshot_tree_reconstruct * r,struct bpos pos)911 static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
912 {
913 return r->btree == BTREE_ID_inodes
914 ? r->cur_pos.offset == pos.offset
915 : r->cur_pos.inode == pos.inode;
916 }
917
snapshot_id_lists_have_common(snapshot_id_list * l,snapshot_id_list * r)918 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
919 {
920 darray_for_each(*l, i)
921 if (snapshot_list_has_id(r, *i))
922 return true;
923 return false;
924 }
925
/* Print the IDs in @s as a space-separated list. */
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
{
	const char *sep = "";

	darray_for_each(*s, i) {
		prt_printf(out, "%s%u", sep, *i);
		sep = " ";
	}
}
936
snapshot_tree_reconstruct_next(struct bch_fs * c,struct snapshot_tree_reconstruct * r)937 static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
938 {
939 if (r->cur_ids.nr) {
940 darray_for_each(r->trees, i)
941 if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
942 int ret = snapshot_list_merge(c, i, &r->cur_ids);
943 if (ret)
944 return ret;
945 goto out;
946 }
947 darray_push(&r->trees, r->cur_ids);
948 darray_init(&r->cur_ids);
949 }
950 out:
951 r->cur_ids.nr = 0;
952 return 0;
953 }
954
get_snapshot_trees(struct bch_fs * c,struct snapshot_tree_reconstruct * r,struct bpos pos)955 static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
956 {
957 if (!same_snapshot(r, pos))
958 snapshot_tree_reconstruct_next(c, r);
959 r->cur_pos = pos;
960 return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
961 }
962
bch2_reconstruct_snapshots(struct bch_fs * c)963 int bch2_reconstruct_snapshots(struct bch_fs *c)
964 {
965 struct btree_trans *trans = bch2_trans_get(c);
966 struct printbuf buf = PRINTBUF;
967 struct snapshot_tree_reconstruct r = {};
968 int ret = 0;
969
970 for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
971 if (btree_type_has_snapshots(btree)) {
972 r.btree = btree;
973
974 ret = for_each_btree_key(trans, iter, btree, POS_MIN,
975 BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({
976 get_snapshot_trees(c, &r, k.k->p);
977 }));
978 if (ret)
979 goto err;
980
981 snapshot_tree_reconstruct_next(c, &r);
982 }
983 }
984
985 darray_for_each(r.trees, t) {
986 printbuf_reset(&buf);
987 snapshot_id_list_to_text(&buf, t);
988
989 darray_for_each(*t, id) {
990 if (fsck_err_on(!bch2_snapshot_exists(c, *id),
991 trans, snapshot_node_missing,
992 "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
993 if (t->nr > 1) {
994 bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
995 ret = -BCH_ERR_fsck_repair_unimplemented;
996 goto err;
997 }
998
999 ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1000 check_snapshot_exists(trans, *id));
1001 if (ret)
1002 goto err;
1003 }
1004 }
1005 }
1006 fsck_err:
1007 err:
1008 bch2_trans_put(trans);
1009 snapshot_tree_reconstruct_exit(&r);
1010 printbuf_exit(&buf);
1011 bch_err_fn(c, ret);
1012 return ret;
1013 }
1014
/*
 * fsck helper: if @k's snapshot ID no longer exists, (optionally) delete
 * the key.  Returns 1 if the key was deleted, 0 if ok, negative on error.
 */
int bch2_check_key_has_snapshot(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
			trans, bkey_in_missing_snapshot,
			"key in missing snapshot %s, delete?",
			(bch2_btree_id_to_text(&buf, iter->btree_id),
			 prt_char(&buf, ' '),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		/* internal_snapshot_node: we're deliberately bypassing snapshot checks */
		ret = bch2_btree_delete_at(trans, iter,
					   BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1035
1036 /*
1037 * Mark a snapshot as deleted, for future cleanup:
1038 */
bch2_snapshot_node_set_deleted(struct btree_trans * trans,u32 id)1039 int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
1040 {
1041 struct btree_iter iter;
1042 struct bkey_i_snapshot *s =
1043 bch2_bkey_get_mut_typed(trans, &iter,
1044 BTREE_ID_snapshots, POS(0, id),
1045 0, snapshot);
1046 int ret = PTR_ERR_OR_ZERO(s);
1047 if (unlikely(ret)) {
1048 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
1049 trans->c, "missing snapshot %u", id);
1050 return ret;
1051 }
1052
1053 /* already deleted? */
1054 if (BCH_SNAPSHOT_DELETED(&s->v))
1055 goto err;
1056
1057 SET_BCH_SNAPSHOT_DELETED(&s->v, true);
1058 SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
1059 s->v.subvol = 0;
1060 err:
1061 bch2_trans_iter_exit(trans, &iter);
1062 return ret;
1063 }
1064
normalize_snapshot_child_pointers(struct bch_snapshot * s)1065 static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
1066 {
1067 if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
1068 swap(s->children[0], s->children[1]);
1069 }
1070
bch2_snapshot_node_delete(struct btree_trans * trans,u32 id)1071 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
1072 {
1073 struct bch_fs *c = trans->c;
1074 struct btree_iter iter, p_iter = {};
1075 struct btree_iter c_iter = {};
1076 struct btree_iter tree_iter = {};
1077 struct bkey_s_c_snapshot s;
1078 u32 parent_id, child_id;
1079 unsigned i;
1080 int ret = 0;
1081
1082 s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
1083 BTREE_ITER_intent, snapshot);
1084 ret = bkey_err(s);
1085 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1086 "missing snapshot %u", id);
1087
1088 if (ret)
1089 goto err;
1090
1091 BUG_ON(s.v->children[1]);
1092
1093 parent_id = le32_to_cpu(s.v->parent);
1094 child_id = le32_to_cpu(s.v->children[0]);
1095
1096 if (parent_id) {
1097 struct bkey_i_snapshot *parent;
1098
1099 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
1100 BTREE_ID_snapshots, POS(0, parent_id),
1101 0, snapshot);
1102 ret = PTR_ERR_OR_ZERO(parent);
1103 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1104 "missing snapshot %u", parent_id);
1105 if (unlikely(ret))
1106 goto err;
1107
1108 /* find entry in parent->children for node being deleted */
1109 for (i = 0; i < 2; i++)
1110 if (le32_to_cpu(parent->v.children[i]) == id)
1111 break;
1112
1113 if (bch2_fs_inconsistent_on(i == 2, c,
1114 "snapshot %u missing child pointer to %u",
1115 parent_id, id))
1116 goto err;
1117
1118 parent->v.children[i] = cpu_to_le32(child_id);
1119
1120 normalize_snapshot_child_pointers(&parent->v);
1121 }
1122
1123 if (child_id) {
1124 struct bkey_i_snapshot *child;
1125
1126 child = bch2_bkey_get_mut_typed(trans, &c_iter,
1127 BTREE_ID_snapshots, POS(0, child_id),
1128 0, snapshot);
1129 ret = PTR_ERR_OR_ZERO(child);
1130 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1131 "missing snapshot %u", child_id);
1132 if (unlikely(ret))
1133 goto err;
1134
1135 child->v.parent = cpu_to_le32(parent_id);
1136
1137 if (!child->v.parent) {
1138 child->v.skip[0] = 0;
1139 child->v.skip[1] = 0;
1140 child->v.skip[2] = 0;
1141 }
1142 }
1143
1144 if (!parent_id) {
1145 /*
1146 * We're deleting the root of a snapshot tree: update the
1147 * snapshot_tree entry to point to the new root, or delete it if
1148 * this is the last snapshot ID in this tree:
1149 */
1150 struct bkey_i_snapshot_tree *s_t;
1151
1152 BUG_ON(s.v->children[1]);
1153
1154 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
1155 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
1156 0, snapshot_tree);
1157 ret = PTR_ERR_OR_ZERO(s_t);
1158 if (ret)
1159 goto err;
1160
1161 if (s.v->children[0]) {
1162 s_t->v.root_snapshot = s.v->children[0];
1163 } else {
1164 s_t->k.type = KEY_TYPE_deleted;
1165 set_bkey_val_u64s(&s_t->k, 0);
1166 }
1167 }
1168
1169 ret = bch2_btree_delete_at(trans, &iter, 0);
1170 err:
1171 bch2_trans_iter_exit(trans, &tree_iter);
1172 bch2_trans_iter_exit(trans, &p_iter);
1173 bch2_trans_iter_exit(trans, &c_iter);
1174 bch2_trans_iter_exit(trans, &iter);
1175 return ret;
1176 }
1177
create_snapids(struct btree_trans * trans,u32 parent,u32 tree,u32 * new_snapids,u32 * snapshot_subvols,unsigned nr_snapids)1178 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
1179 u32 *new_snapids,
1180 u32 *snapshot_subvols,
1181 unsigned nr_snapids)
1182 {
1183 struct bch_fs *c = trans->c;
1184 struct btree_iter iter;
1185 struct bkey_i_snapshot *n;
1186 struct bkey_s_c k;
1187 unsigned i, j;
1188 u32 depth = bch2_snapshot_depth(c, parent);
1189 int ret;
1190
1191 bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
1192 POS_MIN, BTREE_ITER_intent);
1193 k = bch2_btree_iter_peek(trans, &iter);
1194 ret = bkey_err(k);
1195 if (ret)
1196 goto err;
1197
1198 for (i = 0; i < nr_snapids; i++) {
1199 k = bch2_btree_iter_prev_slot(trans, &iter);
1200 ret = bkey_err(k);
1201 if (ret)
1202 goto err;
1203
1204 if (!k.k || !k.k->p.offset) {
1205 ret = -BCH_ERR_ENOSPC_snapshot_create;
1206 goto err;
1207 }
1208
1209 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1210 ret = PTR_ERR_OR_ZERO(n);
1211 if (ret)
1212 goto err;
1213
1214 n->v.flags = 0;
1215 n->v.parent = cpu_to_le32(parent);
1216 n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
1217 n->v.tree = cpu_to_le32(tree);
1218 n->v.depth = cpu_to_le32(depth);
1219 n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
1220 n->v.btime.hi = 0;
1221
1222 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1223 n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
1224
1225 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
1226 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1227
1228 ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1229 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1230 if (ret)
1231 goto err;
1232
1233 new_snapids[i] = iter.pos.offset;
1234 }
1235 err:
1236 bch2_trans_iter_exit(trans, &iter);
1237 return ret;
1238 }
1239
1240 /*
1241 * Create new snapshot IDs as children of an existing snapshot ID:
1242 */
bch2_snapshot_node_create_children(struct btree_trans * trans,u32 parent,u32 * new_snapids,u32 * snapshot_subvols,unsigned nr_snapids)1243 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1244 u32 *new_snapids,
1245 u32 *snapshot_subvols,
1246 unsigned nr_snapids)
1247 {
1248 struct btree_iter iter;
1249 struct bkey_i_snapshot *n_parent;
1250 int ret = 0;
1251
1252 n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1253 BTREE_ID_snapshots, POS(0, parent),
1254 0, snapshot);
1255 ret = PTR_ERR_OR_ZERO(n_parent);
1256 if (unlikely(ret)) {
1257 if (bch2_err_matches(ret, ENOENT))
1258 bch_err(trans->c, "snapshot %u not found", parent);
1259 return ret;
1260 }
1261
1262 if (n_parent->v.children[0] || n_parent->v.children[1]) {
1263 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1264 ret = -EINVAL;
1265 goto err;
1266 }
1267
1268 ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1269 new_snapids, snapshot_subvols, nr_snapids);
1270 if (ret)
1271 goto err;
1272
1273 n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1274 n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1275 n_parent->v.subvol = 0;
1276 SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1277 err:
1278 bch2_trans_iter_exit(trans, &iter);
1279 return ret;
1280 }
1281
1282 /*
1283 * Create a snapshot node that is the root of a new tree:
1284 */
bch2_snapshot_node_create_tree(struct btree_trans * trans,u32 * new_snapids,u32 * snapshot_subvols,unsigned nr_snapids)1285 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1286 u32 *new_snapids,
1287 u32 *snapshot_subvols,
1288 unsigned nr_snapids)
1289 {
1290 struct bkey_i_snapshot_tree *n_tree;
1291 int ret;
1292
1293 n_tree = __bch2_snapshot_tree_create(trans);
1294 ret = PTR_ERR_OR_ZERO(n_tree) ?:
1295 create_snapids(trans, 0, n_tree->k.p.offset,
1296 new_snapids, snapshot_subvols, nr_snapids);
1297 if (ret)
1298 return ret;
1299
1300 n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1301 n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1302 return 0;
1303 }
1304
bch2_snapshot_node_create(struct btree_trans * trans,u32 parent,u32 * new_snapids,u32 * snapshot_subvols,unsigned nr_snapids)1305 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1306 u32 *new_snapids,
1307 u32 *snapshot_subvols,
1308 unsigned nr_snapids)
1309 {
1310 BUG_ON((parent == 0) != (nr_snapids == 1));
1311 BUG_ON((parent != 0) != (nr_snapids == 2));
1312
1313 return parent
1314 ? bch2_snapshot_node_create_children(trans, parent,
1315 new_snapids, snapshot_subvols, nr_snapids)
1316 : bch2_snapshot_node_create_tree(trans,
1317 new_snapids, snapshot_subvols, nr_snapids);
1318
1319 }
1320
1321 /*
1322 * If we have an unlinked inode in an internal snapshot node, and the inode
1323 * really has been deleted in all child snapshots, how does this get cleaned up?
1324 *
1325 * first there is the problem of how keys that have been overwritten in all
1326 * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1327 * special?
1328 *
1329 * also: unlinked inode in internal snapshot appears to not be getting deleted
1330 * correctly if inode doesn't exist in leaf snapshots
1331 *
 * Solution:
 *
 * For a key in an interior snapshot node that needs to be mutated: iterate
 * over all descendant leaf nodes and copy the key to each of them, where it
 * can then be mutated safely
1337 */
1338
/*
 * An interior snapshot node scheduled for deletion: keys in snapshot @id
 * are moved to @live_child (its surviving descendant) instead of dropped.
 */
struct snapshot_interior_delete {
	u32 id;		/* snapshot node being deleted */
	u32 live_child;	/* surviving descendant that inherits its keys */
};
typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
1344
interior_delete_has_id(interior_delete_list * l,u32 id)1345 static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
1346 {
1347 darray_for_each(*l, i)
1348 if (i->id == id)
1349 return i->live_child;
1350 return 0;
1351 }
1352
__live_child(struct snapshot_table * t,u32 id,snapshot_id_list * delete_leaves,interior_delete_list * delete_interior)1353 static unsigned __live_child(struct snapshot_table *t, u32 id,
1354 snapshot_id_list *delete_leaves,
1355 interior_delete_list *delete_interior)
1356 {
1357 struct snapshot_t *s = __snapshot_t(t, id);
1358 if (!s)
1359 return 0;
1360
1361 for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++)
1362 if (s->children[i] &&
1363 !snapshot_list_has_id(delete_leaves, s->children[i]) &&
1364 !interior_delete_has_id(delete_interior, s->children[i]))
1365 return s->children[i];
1366
1367 for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) {
1368 u32 live_child = s->children[i]
1369 ? __live_child(t, s->children[i], delete_leaves, delete_interior)
1370 : 0;
1371 if (live_child)
1372 return live_child;
1373 }
1374
1375 return 0;
1376 }
1377
live_child(struct bch_fs * c,u32 id,snapshot_id_list * delete_leaves,interior_delete_list * delete_interior)1378 static unsigned live_child(struct bch_fs *c, u32 id,
1379 snapshot_id_list *delete_leaves,
1380 interior_delete_list *delete_interior)
1381 {
1382 rcu_read_lock();
1383 u32 ret = __live_child(rcu_dereference(c->snapshots), id,
1384 delete_leaves, delete_interior);
1385 rcu_read_unlock();
1386 return ret;
1387 }
1388
delete_dead_snapshots_process_key(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k,snapshot_id_list * delete_leaves,interior_delete_list * delete_interior)1389 static int delete_dead_snapshots_process_key(struct btree_trans *trans,
1390 struct btree_iter *iter,
1391 struct bkey_s_c k,
1392 snapshot_id_list *delete_leaves,
1393 interior_delete_list *delete_interior)
1394 {
1395 if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
1396 return bch2_btree_delete_at(trans, iter,
1397 BTREE_UPDATE_internal_snapshot_node);
1398
1399 u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
1400 if (live_child) {
1401 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
1402 int ret = PTR_ERR_OR_ZERO(new);
1403 if (ret)
1404 return ret;
1405
1406 new->k.p.snapshot = live_child;
1407
1408 struct btree_iter dst_iter;
1409 struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter,
1410 iter->btree_id, new->k.p,
1411 BTREE_ITER_all_snapshots|
1412 BTREE_ITER_intent);
1413 ret = bkey_err(dst_k);
1414 if (ret)
1415 return ret;
1416
1417 ret = (bkey_deleted(dst_k.k)
1418 ? bch2_trans_update(trans, &dst_iter, new,
1419 BTREE_UPDATE_internal_snapshot_node)
1420 : 0) ?:
1421 bch2_btree_delete_at(trans, iter,
1422 BTREE_UPDATE_internal_snapshot_node);
1423 bch2_trans_iter_exit(trans, &dst_iter);
1424 return ret;
1425 }
1426
1427 return 0;
1428 }
1429
1430 /*
1431 * For a given snapshot, if it doesn't have a subvolume that points to it, and
1432 * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1433 * as deleted.
1434 */
check_should_delete_snapshot(struct btree_trans * trans,struct bkey_s_c k,snapshot_id_list * delete_leaves,interior_delete_list * delete_interior)1435 static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
1436 snapshot_id_list *delete_leaves,
1437 interior_delete_list *delete_interior)
1438 {
1439 if (k.k->type != KEY_TYPE_snapshot)
1440 return 0;
1441
1442 struct bch_fs *c = trans->c;
1443 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
1444 unsigned live_children = 0;
1445
1446 if (BCH_SNAPSHOT_SUBVOL(s.v))
1447 return 0;
1448
1449 for (unsigned i = 0; i < 2; i++) {
1450 u32 child = le32_to_cpu(s.v->children[i]);
1451
1452 live_children += child &&
1453 !snapshot_list_has_id(delete_leaves, child);
1454 }
1455
1456 if (live_children == 0) {
1457 return snapshot_list_add(c, delete_leaves, s.k->p.offset);
1458 } else if (live_children == 1) {
1459 struct snapshot_interior_delete d = {
1460 .id = s.k->p.offset,
1461 .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
1462 };
1463
1464 if (!d.live_child) {
1465 bch_err(c, "error finding live child of snapshot %u", d.id);
1466 return -EINVAL;
1467 }
1468
1469 return darray_push(delete_interior, d);
1470 } else {
1471 return 0;
1472 }
1473 }
1474
bch2_snapshot_nth_parent_skip(struct bch_fs * c,u32 id,u32 n,interior_delete_list * skip)1475 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1476 interior_delete_list *skip)
1477 {
1478 rcu_read_lock();
1479 while (interior_delete_has_id(skip, id))
1480 id = __bch2_snapshot_parent(c, id);
1481
1482 while (n--) {
1483 do {
1484 id = __bch2_snapshot_parent(c, id);
1485 } while (interior_delete_has_id(skip, id));
1486 }
1487 rcu_read_unlock();
1488
1489 return id;
1490 }
1491
bch2_fix_child_of_deleted_snapshot(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k,interior_delete_list * deleted)1492 static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
1493 struct btree_iter *iter, struct bkey_s_c k,
1494 interior_delete_list *deleted)
1495 {
1496 struct bch_fs *c = trans->c;
1497 u32 nr_deleted_ancestors = 0;
1498 struct bkey_i_snapshot *s;
1499 int ret;
1500
1501 if (k.k->type != KEY_TYPE_snapshot)
1502 return 0;
1503
1504 if (interior_delete_has_id(deleted, k.k->p.offset))
1505 return 0;
1506
1507 s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
1508 ret = PTR_ERR_OR_ZERO(s);
1509 if (ret)
1510 return ret;
1511
1512 darray_for_each(*deleted, i)
1513 nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id);
1514
1515 if (!nr_deleted_ancestors)
1516 return 0;
1517
1518 le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
1519
1520 if (!s->v.depth) {
1521 s->v.skip[0] = 0;
1522 s->v.skip[1] = 0;
1523 s->v.skip[2] = 0;
1524 } else {
1525 u32 depth = le32_to_cpu(s->v.depth);
1526 u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
1527
1528 for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
1529 u32 id = le32_to_cpu(s->v.skip[j]);
1530
1531 if (interior_delete_has_id(deleted, id)) {
1532 id = bch2_snapshot_nth_parent_skip(c,
1533 parent,
1534 depth > 1
1535 ? get_random_u32_below(depth - 1)
1536 : 0,
1537 deleted);
1538 s->v.skip[j] = cpu_to_le32(id);
1539 }
1540 }
1541
1542 bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
1543 }
1544
1545 return bch2_trans_update(trans, iter, &s->k_i, 0);
1546 }
1547
bch2_delete_dead_snapshots(struct bch_fs * c)1548 int bch2_delete_dead_snapshots(struct bch_fs *c)
1549 {
1550 if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
1551 return 0;
1552
1553 struct btree_trans *trans = bch2_trans_get(c);
1554 snapshot_id_list delete_leaves = {};
1555 interior_delete_list delete_interior = {};
1556 int ret = 0;
1557
1558 /*
1559 * For every snapshot node: If we have no live children and it's not
1560 * pointed to by a subvolume, delete it:
1561 */
1562 ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
1563 check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
1564 if (!bch2_err_matches(ret, EROFS))
1565 bch_err_msg(c, ret, "walking snapshots");
1566 if (ret)
1567 goto err;
1568
1569 if (!delete_leaves.nr && !delete_interior.nr)
1570 goto err;
1571
1572 {
1573 struct printbuf buf = PRINTBUF;
1574 prt_printf(&buf, "deleting leaves");
1575 darray_for_each(delete_leaves, i)
1576 prt_printf(&buf, " %u", *i);
1577
1578 prt_printf(&buf, " interior");
1579 darray_for_each(delete_interior, i)
1580 prt_printf(&buf, " %u->%u", i->id, i->live_child);
1581
1582 ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
1583 printbuf_exit(&buf);
1584 if (ret)
1585 goto err;
1586 }
1587
1588 for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
1589 struct disk_reservation res = { 0 };
1590
1591 if (!btree_type_has_snapshots(btree))
1592 continue;
1593
1594 ret = for_each_btree_key_commit(trans, iter,
1595 btree, POS_MIN,
1596 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1597 &res, NULL, BCH_TRANS_COMMIT_no_enospc,
1598 delete_dead_snapshots_process_key(trans, &iter, k,
1599 &delete_leaves,
1600 &delete_interior));
1601
1602 bch2_disk_reservation_put(c, &res);
1603
1604 if (!bch2_err_matches(ret, EROFS))
1605 bch_err_msg(c, ret, "deleting keys from dying snapshots");
1606 if (ret)
1607 goto err;
1608 }
1609
1610 darray_for_each(delete_leaves, i) {
1611 ret = commit_do(trans, NULL, NULL, 0,
1612 bch2_snapshot_node_delete(trans, *i));
1613 if (!bch2_err_matches(ret, EROFS))
1614 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1615 if (ret)
1616 goto err;
1617 }
1618
1619 /*
1620 * Fixing children of deleted snapshots can't be done completely
1621 * atomically, if we crash between here and when we delete the interior
1622 * nodes some depth fields will be off:
1623 */
1624 ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1625 BTREE_ITER_intent, k,
1626 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1627 bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
1628 if (ret)
1629 goto err;
1630
1631 darray_for_each(delete_interior, i) {
1632 ret = commit_do(trans, NULL, NULL, 0,
1633 bch2_snapshot_node_delete(trans, i->id));
1634 if (!bch2_err_matches(ret, EROFS))
1635 bch_err_msg(c, ret, "deleting snapshot %u", i->id);
1636 if (ret)
1637 goto err;
1638 }
1639 err:
1640 darray_exit(&delete_interior);
1641 darray_exit(&delete_leaves);
1642 bch2_trans_put(trans);
1643 if (!bch2_err_matches(ret, EROFS))
1644 bch_err_fn(c, ret);
1645 return ret;
1646 }
1647
bch2_delete_dead_snapshots_work(struct work_struct * work)1648 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1649 {
1650 struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1651
1652 set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
1653
1654 bch2_delete_dead_snapshots(c);
1655 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1656 }
1657
bch2_delete_dead_snapshots_async(struct bch_fs * c)1658 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1659 {
1660 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
1661 return;
1662
1663 BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
1664
1665 if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1666 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1667 }
1668
__bch2_key_has_snapshot_overwrites(struct btree_trans * trans,enum btree_id id,struct bpos pos)1669 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1670 enum btree_id id,
1671 struct bpos pos)
1672 {
1673 struct bch_fs *c = trans->c;
1674 struct btree_iter iter;
1675 struct bkey_s_c k;
1676 int ret;
1677
1678 for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos),
1679 BTREE_ITER_not_extents|
1680 BTREE_ITER_all_snapshots,
1681 k, ret) {
1682 if (!bkey_eq(pos, k.k->p))
1683 break;
1684
1685 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1686 ret = 1;
1687 break;
1688 }
1689 }
1690 bch2_trans_iter_exit(trans, &iter);
1691
1692 return ret;
1693 }
1694
interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)1695 static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap)
1696 {
1697 /* If there's one child, it's redundant and keys will be moved to the child */
1698 return !!snap.v->children[0] + !!snap.v->children[1] == 1;
1699 }
1700
bch2_check_snapshot_needs_deletion(struct btree_trans * trans,struct bkey_s_c k)1701 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
1702 {
1703 if (k.k->type != KEY_TYPE_snapshot)
1704 return 0;
1705
1706 struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
1707 if (BCH_SNAPSHOT_DELETED(snap.v) ||
1708 interior_snapshot_needs_delete(snap))
1709 set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
1710
1711 return 0;
1712 }
1713
bch2_snapshots_read(struct bch_fs * c)1714 int bch2_snapshots_read(struct bch_fs *c)
1715 {
1716 /*
1717 * Initializing the is_ancestor bitmaps requires ancestors to already be
1718 * initialized - so mark in reverse:
1719 */
1720 int ret = bch2_trans_run(c,
1721 for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots,
1722 POS_MAX, 0, k,
1723 __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
1724 bch2_check_snapshot_needs_deletion(trans, k)));
1725 bch_err_fn(c, ret);
1726
1727 /*
1728 * It's important that we check if we need to reconstruct snapshots
1729 * before going RW, so we mark that pass as required in the superblock -
1730 * otherwise, we could end up deleting keys with missing snapshot nodes
1731 * instead
1732 */
1733 BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
1734 test_bit(BCH_FS_may_go_rw, &c->flags));
1735
1736 if (bch2_err_matches(ret, EIO) ||
1737 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
1738 ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
1739
1740 return ret;
1741 }
1742
bch2_fs_snapshots_exit(struct bch_fs * c)1743 void bch2_fs_snapshots_exit(struct bch_fs *c)
1744 {
1745 kvfree(rcu_dereference_protected(c->snapshots, true));
1746 }
1747