1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "keylist.h"
16 #include "namei.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25
dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,struct bch_inode_unpacked * inode)26 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
27 struct bch_inode_unpacked *inode)
28 {
29 if (d.v->d_type == DT_SUBVOL
30 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
31 : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
32 return 0;
33 return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
34 }
35
/*
 * Format a "dirent doesn't point back at inode" report into @out: the header
 * line, then the dirent key, then the unpacked inode.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
47
dirent_points_to_inode(struct bch_fs * c,struct bkey_s_c_dirent dirent,struct bch_inode_unpacked * inode)48 static int dirent_points_to_inode(struct bch_fs *c,
49 struct bkey_s_c_dirent dirent,
50 struct bch_inode_unpacked *inode)
51 {
52 int ret = dirent_points_to_inode_nowarn(dirent, inode);
53 if (ret) {
54 struct printbuf buf = PRINTBUF;
55 dirent_inode_mismatch_msg(&buf, c, dirent, inode);
56 bch_warn(c, "%s", buf.buf);
57 printbuf_exit(&buf);
58 }
59 return ret;
60 }
61
62 /*
63 * XXX: this is handling transaction restarts without returning
64 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
65 */
/*
 * Sum the sizes of all allocated extents of inode @inum in @snapshot.
 *
 * Returns: total sectors (>= 0), or a negative bch error from iteration.
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		/* only count keys representing real allocations, not holes: */
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;
	}));

	return ret ?: sectors;
}
82
/*
 * Count the subdirectory dirents of directory @inum in @snapshot (used to
 * validate bi_nlink).
 *
 * Returns: subdirectory count (>= 0), or a negative bch error from iteration.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
			      u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;
	}));

	return ret ?: subdirs;
}
100
/*
 * Look up subvolume @subvol, returning its snapshot ID and root inode number.
 *
 * Note: the out parameters are written unconditionally, even on error - they
 * contain whatever bch2_subvolume_get() left in @s in that case.
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return ret;
}
111
/*
 * Fetch and unpack the inode at @inode_nr in @snapshot.
 *
 * Returns: 0 on success, -BCH_ERR_ENOENT_inode if no inode key exists at
 * that position, or another negative bch error.
 */
static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, inode_nr, snapshot), 0);
	int ret = bkey_err(k);

	if (!ret)
		ret = bkey_is_inode(k.k)
			? bch2_inode_unpack(k, inode)
			: -BCH_ERR_ENOENT_inode;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
132
/*
 * Look up @name in directory @dir within a specific @snapshot via the dirent
 * hash table, returning the target inode number and d_type.
 *
 * NOTE(review): *target is read from d_inum unconditionally; for DT_SUBVOL
 * dirents the target is presumably the child subvolume field instead -
 * confirm callers handle the DT_SUBVOL case themselves.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
151
152 /*
153 * Find any subvolume associated with a tree of snapshots
154 * We can't rely on master_subvol - it might have been deleted.
155 */
static int find_snapshot_tree_subvol(struct btree_trans *trans,
				     u32 tree_id, u32 *subvol)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
		if (le32_to_cpu(s.v->tree) != tree_id)
			continue;

		if (s.v->subvol) {
			*subvol = le32_to_cpu(s.v->subvol);
			goto found;
		}
	}
	/*
	 * If the scan itself failed (e.g. a transaction restart from the
	 * norestart iterator), propagate that error instead of clobbering it
	 * with ENOENT:
	 */
	if (!ret)
		ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
181
182 /* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = {};
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
			bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	/* Find which subvolume to look up lost+found in: */
	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	if (!subvol.inode) {
		/*
		 * Subvolume has no root inode recorded: point it at the inode
		 * being reattached.  (Renamed from "subvol" - the original
		 * shadowed the read-only copy above.)
		 */
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol_mut = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol_mut);
		if (ret)
			return ret;

		subvol_mut->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
					&lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, snapshot, lostfound);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret) {
		/* don't leak the printbuf's allocation on the error path: */
		printbuf_exit(&path);
		goto err;
	}

	/*
	 * NOTE(review): root_inum.subvol looks like a 32-bit field printed
	 * with %llu - verify the format specifier matches the type.
	 */
	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
	if (ret)
		goto err;

	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				BTREE_UPDATE_internal_snapshot_node|
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
318
inode_should_reattach(struct bch_inode_unpacked * inode)319 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
320 {
321 if (inode->bi_inum == BCACHEFS_ROOT_INO &&
322 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
323 return false;
324
325 /*
326 * Subvolume roots are special: older versions of subvolume roots may be
327 * disconnected, it's only the newest version that matters.
328 *
329 * We only keep a single dirent pointing to a subvolume root, i.e.
330 * older versions of snapshots will not have a different dirent pointing
331 * to the same subvolume root.
332 *
333 * This is because dirents that point to subvolumes are only visible in
334 * the parent subvolume - versioning is not needed - and keeping them
335 * around would break fsck, because when we're crossing subvolumes we
336 * don't have a consistent snapshot ID to do check the inode <-> dirent
337 * relationships.
338 *
339 * Thus, a subvolume root that's been renamed after a snapshot will have
340 * a disconnected older version - that's expected.
341 *
342 * Note that taking a snapshot always updates the root inode (to update
343 * the dirent backpointer), so a subvolume root inode with
344 * BCH_INODE_has_child_snapshot is never visible.
345 */
346 if (inode->bi_subvol &&
347 (inode->bi_flags & BCH_INODE_has_child_snapshot))
348 return false;
349
350 return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
351 }
352
/*
 * If the dirent at @d_pos is visible in @snapshot, overwrite it with a
 * whiteout at @snapshot - hiding it in that snapshot while leaving ancestor
 * snapshots untouched.
 *
 * NOTE(review): the early return on bkey_err() doesn't exit @iter -
 * presumably bch2_bkey_get_iter() cleans up after itself on error; confirm.
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
			SPOS(d_pos.inode, d_pos.offset, snapshot),
			BTREE_ITER_intent|
			BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	/* bpos_eq() includes the snapshot field - only act on an exact match: */
	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * bch2_btree_delete_at() doesn't work because the update path
		 * doesn't internally use BTREE_ITER_with_updates yet
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
383
/*
 * Reattach a disconnected inode to lost+found: create a dirent in lost+found
 * pointing at it (named "subvol-N" for subvolume roots, otherwise the decimal
 * inode number), update the inode's backpointer fields, then fix up versions
 * of the inode in child snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	/* Subvolume roots get reattached in the parent subvolume's snapshot: */
	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* reattaching a directory adds a link (its "..") to lost+found: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	/* set the backpointer before creating the dirent that it describes: */
	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
			inode->bi_parent_subvol, lostfound.bi_inum,
			dirent_snapshot,
			&dir_hash,
			inode_d_type(inode),
			&name,
			inode->bi_subvol ?: inode->bi_inum,
			&inode->bi_dir_offset,
			BTREE_UPDATE_internal_snapshot_node|
			STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/* scan backwards from just below our snapshot ID: */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			/* only consider descendants we haven't already whited out: */
			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/*
				 * This version shouldn't be reattached: hide
				 * the new dirent in its snapshot.
				 */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* share the backpointer with the child version: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
502
/* Fetch the dirent at an exact btree position, typed; caller exits @iter. */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
509
remove_backpointer(struct btree_trans * trans,struct bch_inode_unpacked * inode)510 static int remove_backpointer(struct btree_trans *trans,
511 struct bch_inode_unpacked *inode)
512 {
513 if (!inode->bi_dir)
514 return 0;
515
516 struct bch_fs *c = trans->c;
517 struct btree_iter iter;
518 struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
519 SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
520 int ret = bkey_err(d) ?:
521 dirent_points_to_inode(c, d, inode) ?:
522 bch2_fsck_remove_dirent(trans, d.k->p);
523 bch2_trans_iter_exit(trans, &iter);
524 return ret;
525 }
526
/*
 * Reattach a disconnected subvolume: look up its root inode, remove the
 * (stale) dirent its backpointer refers to, then reattach the root inode to
 * lost+found.
 *
 * NOTE(review): an ENOENT from remove_backpointer() isn't logged but still
 * causes an early return - presumably a missing dirent is an acceptable
 * outcome here; confirm whether reattach should proceed in that case.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
548
/*
 * Recreate a missing subvolume key for (@snapshotid, @subvolid), with root
 * inode @inum; if @inum is 0 a fresh root inode is created first.  Also
 * points the snapshot, and the snapshot tree's master_subvol if unset, back
 * at the new subvolume.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	/* create the subvolume key itself: */
	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset = subvolid;
	new_subvol->v.snapshot = cpu_to_le32(snapshotid);
	new_subvol->v.inode = cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* point the snapshot back at the subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* @iter is reused here after being exited above: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
627
/*
 * Recreate a missing inode key for @inum:@snapshot, inferring mode and size
 * from the btree where orphaned keys were found:
 * - extents: regular file; i_size from the end of the last extent
 * - dirents: directory
 * - xattrs:  regular file
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		/* find the last extent to derive the file size: */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/* extent offsets are in 512-byte sectors: */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
666
/*
 * Tracks the snapshot IDs of keys already seen at a single btree position
 * while scanning in natural key order; ids is kept sorted.
 */
struct snapshots_seen {
	struct bpos pos;	/* position the id list is valid for */
	snapshot_id_list ids;	/* snapshot IDs seen at @pos, sorted */
};
671
/* Free the snapshot ID list. */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
676
/* Zero-initialize; pairs with snapshots_seen_exit(). */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
681
/*
 * Insert @id into @s->ids keeping the list sorted; no-op if already present.
 * Returns 0 on success or a negative error if growing the darray fails.
 */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	/* @i now points at the first element > @id (or one past the end): */
	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
698
/*
 * Record that we've seen @pos.snapshot at position @pos, resetting the id
 * list when we move to a new position.
 *
 * NOTE(review): bkey_eq() presumably compares positions excluding the
 * snapshot field (unlike bpos_eq()), so @ids accumulates across the
 * snapshots of one inode:offset - confirm against bkey_eq()'s definition.
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
708
709 /**
710 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
711 * and @ancestor hasn't been overwritten in @seen
712 *
713 * @c: filesystem handle
714 * @seen: list of snapshot ids already seen at current position
715 * @id: descendent snapshot id
716 * @ancestor: ancestor snapshot id
717 *
718 * Returns: whether key in @ancestor snapshot is visible in @id snapshot
719 */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	/* start at nr - 2: the last entry is @ancestor itself (asserted above) */
	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}
755
756 /**
757 * ref_visible - given a key with snapshot id @src that points to a key with
758 * snapshot id @dst, test whether there is some snapshot in which @dst is
759 * visible.
760 *
761 * @c: filesystem handle
762 * @s: list of snapshot IDs already seen at @src
763 * @src: snapshot ID of src key
764 * @dst: snapshot ID of dst key
765 * Returns: true if there is some snapshot in which @dst is visible
766 *
767 * Assumes we're visiting @src keys in natural key order
768 */
ref_visible(struct bch_fs * c,struct snapshots_seen * s,u32 src,u32 dst)769 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
770 u32 src, u32 dst)
771 {
772 return dst <= src
773 ? key_visible_in_snapshot(c, s, dst, src)
774 : bch2_snapshot_is_ancestor(c, src, dst);
775 }
776
ref_visible2(struct bch_fs * c,u32 src,struct snapshots_seen * src_seen,u32 dst,struct snapshots_seen * dst_seen)777 static int ref_visible2(struct bch_fs *c,
778 u32 src, struct snapshots_seen *src_seen,
779 u32 dst, struct snapshots_seen *dst_seen)
780 {
781 if (dst > src) {
782 swap(dst, src);
783 swap(dst_seen, src_seen);
784 }
785 return key_visible_in_snapshot(c, src_seen, dst, src);
786 }
787
/*
 * Iterate @_i over the inode versions in walker @_w that are visible in
 * @_snapshot: entries with snapshot ID <= @_snapshot whose snapshot is an
 * unoverwritten ancestor per key_visible_in_snapshot().
 */
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
	     (_i)->snapshot <= (_snapshot); _i++)					\
		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
792
/* One version (snapshot) of the inode currently being walked. */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32 snapshot;	/* snapshot ID this inode version lives in */
	u64 count;	/* running total accumulated by callers - presumably sectors/dirents; not visible in this chunk */
	u64 i_size;	/* running size accumulated by callers */
};
799
/*
 * Caches all snapshot versions of one inode number while scanning a btree in
 * key order, so per-key checks don't re-read the inodes btree.
 */
struct inode_walker {
	bool first_this_inode;		/* true until first key of current inode processed */
	bool have_inodes;		/* inodes[] is valid for last_pos.inode */
	bool recalculate_sums;
	struct bpos last_pos;		/* position of the last key walked */

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list deletes;	/* snapshots in which the inode was deleted (see get_visible_inodes()) */
};
809
/* Free both darrays owned by the walker. */
static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}
815
/* Return a zeroed walker; pairs with inode_walker_exit(). */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
820
/*
 * Unpack inode key @inode and append it to @w's version list.
 * Returns 0, or a negative error from unpacking or growing the darray.
 */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	struct bch_inode_unpacked u;

	return bch2_inode_unpack(inode, &u) ?:
		darray_push(&w->inodes, ((struct inode_walker_entry) {
		.inode		= u,
		.snapshot	= inode.k->p.snapshot,
	}));
}
832
/*
 * (Re)load all snapshot versions of inode @inum into walker @w.
 * Sets w->have_inodes/w->first_this_inode only after a fully successful scan.
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (bkey_is_inode(k.k)) {
			/*
			 * add_inode() can fail (unpack error, or allocation
			 * failure growing the darray) - don't drop the error,
			 * or we'd silently continue with a truncated list of
			 * inode versions:
			 */
			ret = add_inode(c, w, k);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}
867
/*
 * Find the inode version that key @k belongs to: the walker entry whose
 * snapshot is the nearest ancestor of (or equal to) @k's snapshot.
 *
 * If @k is in a strictly newer snapshot than any inode we have - which
 * shouldn't happen, since updating a key should update the inode too - a new
 * entry is synthesized for @k's snapshot so per-snapshot accounting works.
 *
 * Returns NULL if no version covers @k, or an ERR_PTR on allocation failure.
 */
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
{
	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;

	struct inode_walker_entry *i;
	__darray_for_each(w->inodes, i)
		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
			goto found;

	return NULL;
found:
	BUG_ON(k.k->p.snapshot > i->snapshot);

	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
		/* clone the ancestor's inode for @k's exact snapshot: */
		struct inode_walker_entry new = *i;

		new.snapshot = k.k->p.snapshot;
		new.count = 0;
		new.i_size = 0;

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, k);

		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
			 "unexpected because we should always update the inode when we update a key in that inode\n"
			 "%s",
			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
		printbuf_exit(&buf);

		/* keep the entries sorted by snapshot ID: */
		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
			--i;

		size_t pos = i - w->inodes.data;
		int ret = darray_insert_item(&w->inodes, pos, new);
		if (ret)
			return ERR_PTR(ret);

		/* darray_insert_item() may have reallocated - recompute @i: */
		i = w->inodes.data + pos;
	}

	return i;
}
911
walk_inode(struct btree_trans * trans,struct inode_walker * w,struct bkey_s_c k)912 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
913 struct inode_walker *w,
914 struct bkey_s_c k)
915 {
916 if (w->last_pos.inode != k.k->p.inode) {
917 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
918 if (ret)
919 return ERR_PTR(ret);
920 }
921
922 w->last_pos = k.k->p;
923
924 return lookup_inode_for_snapshot(trans->c, w, k);
925 }
926
/*
 * Collect all versions of inode @inum visible from the position most recently
 * recorded in @s: visible inode keys are added to w->inodes, while snapshots
 * in which the inode slot is empty (deleted) are recorded in w->deletes.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* a deletion in an ancestor snapshot shadows older versions: */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
961
962 /*
963 * Prefer to delete the first one, since that will be the one at the wrong
964 * offset:
965 * return value: 0 -> delete k1, 1 -> delete k2
966 */
/*
 * After fsck has (re)written a dirent at a new offset, update the
 * bi_dir_offset backpointer of every inode version the dirent is visible to.
 * No-op for non-dirent keys; subvolume dirents are not handled (BUG()).
 */
int bch2_fsck_update_backpointers(struct btree_trans *trans,
				  struct snapshots_seen *s,
				  const struct bch_hash_desc desc,
				  struct bch_hash_info *hash_info,
				  struct bkey_i *new)
{
	if (new->k.type != KEY_TYPE_dirent)
		return 0;

	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
	struct inode_walker target = inode_walker_init();
	int ret = 0;

	if (d->v.d_type == DT_SUBVOL) {
		BUG();
	} else {
		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
		if (ret)
			goto err;

		darray_for_each(target.inodes, i) {
			i->inode.bi_dir_offset = d->k.p.offset;
			ret = __bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				goto err;
		}
	}
err:
	inode_walker_exit(&target);
	return ret;
}
998
/*
 * Return the dirent that @inode's backpointer fields refer to.
 *
 * For subvolume roots the dirent lives in the parent subvolume, so *snapshot
 * is first resolved to the parent subvolume's snapshot; the parent's root
 * inode number returned by subvol_lookup() is deliberately discarded.
 * Errors are returned via the .k pointer (check with bkey_err()).
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
1013
check_inode_deleted_list(struct btree_trans * trans,struct bpos p)1014 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
1015 {
1016 struct btree_iter iter;
1017 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
1018 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
1019 bch2_trans_iter_exit(trans, &iter);
1020 return ret;
1021 }
1022
/*
 * Verify that this inode's backpointer (bi_dir/bi_dir_offset) points to a
 * dirent that points back at it; when it doesn't, clear the backpointer
 * fields and set *write_inode so the caller rewrites the inode.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	/*
	 * NOTE(review): this early return assumes dirent_iter holds nothing on
	 * error (it was zero-initialized above) - confirm against
	 * inode_get_dirent()/dirent_get_by_pos() error behavior.
	 */
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	/* Missing dirent, or dirent that doesn't point back at this inode: */
	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}
out:
	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1082
/*
 * Find and unpack the snapshot-root version of inode @inum: iterating in
 * reverse from snapshot U32_MAX finds the inode key with the highest
 * snapshot ID.
 *
 * BUG()s if no inode exists at @inum - callers only pass inode numbers they
 * have already seen an inode key for.
 */
static int get_snapshot_root_inode(struct btree_trans *trans,
				   struct bch_inode_unpacked *root,
				   u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
					     SPOS(0, inum, U32_MAX),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;
		if (bkey_is_inode(k.k))
			goto found_root;
	}
	if (ret)
		goto err;
	BUG();
found_root:
	ret = bch2_inode_unpack(k, root);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1108
/*
 * Check/repair a single key in the inodes btree:
 *
 * - hash seed and hash type must match the snapshot-root version of the inode
 * - backpointer (bi_dir/bi_dir_offset) must point to a dirent pointing back
 * - an inode with a dirent backpointer must not be flagged unlinked
 * - an unlinked directory must be empty
 * - the has_child_snapshot flag must match reality
 * - an unlinked inode must be on the deleted list (offline fsck) or open
 *   (online fsck), else it's deleted
 * - bi_subvol/bi_parent_subvol must point at a subvolume pointing back
 * - bi_journal_seq must not be ahead of the journal
 *
 * @snapshot_root caches the snapshot-root inode between calls; refreshed when
 * we move to a different inode number.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	if (ret)
		return 0;	/* positive return: nothing further to check */

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* New inode number: refresh the cached snapshot-root version */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
		if (ret)
			goto err;
	}

	/* Hash info must be consistent across all snapshot versions: */
	if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
			INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
			trans, inode_snapshot_mismatch,
			"inode hash info in different snapshots don't match")) {
		u.bi_hash_seed = snapshot_root->bi_hash_seed;
		SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
		do_update = true;
	}

	if (u.bi_dir || u.bi_dir_offset) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	/* ret > 0: child snapshots exist */
	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
					"inode %llu:%u unlinked and not open",
					u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	/* Only subvolume roots carry bi_parent_subvol: */
	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		/* Subvolumes btree lost data: rebuild the missing subvolume */
		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1311
/*
 * fsck pass: walk every key in the inodes btree, in every snapshot, checking
 * and repairing per-inode invariants - see check_inode().
 */
int bch2_check_inodes(struct bch_fs *c)
{
	/* cached snapshot-root inode, shared across check_inode() calls: */
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1330
/*
 * Walk ancestor-snapshot versions of @inode (keys with snapshot IDs above
 * inode->bi_snapshot) and replace *inode with the oldest version that still
 * needs reattaching, so reattach happens at the right snapshot.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* only versions in ancestor snapshots of ours count: */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		/* older version also needs reattach - prefer it */
		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1371
/*
 * If this inode should be reachable but has no dirent backpointer, reattach
 * it via reattach_inode() - at the oldest snapshot version that needs it.
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	/* reattach at the oldest version needing it, not necessarily this one: */
	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1403
1404 /*
1405 * Reattach unreachable (but not unlinked) inodes
1406 *
 * Run after check_inodes() and check_dirents(), so we know that inode
1408 * backpointer fields point to valid dirents, and every inode that has a dirent
1409 * that points to it has its backpointer field set - so we're just looking for
1410 * non-unlinked inodes without backpointers:
1411 *
1412 * XXX: this is racy w.r.t. hardlink removal in online fsck
1413 */
int bch2_check_unreachable_inodes(struct bch_fs *c)
{
	/* each check_unreachable_inode() call is its own committed transaction */
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_unreachable_inode(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
1425
btree_matches_i_mode(enum btree_id btree,unsigned mode)1426 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1427 {
1428 switch (btree) {
1429 case BTREE_ID_extents:
1430 return S_ISREG(mode) || S_ISLNK(mode);
1431 case BTREE_ID_dirents:
1432 return S_ISDIR(mode);
1433 case BTREE_ID_xattrs:
1434 return true;
1435 default:
1436 BUG();
1437 }
1438 }
1439
/*
 * Check that key @k (extent/dirent/xattr) belongs to an existing inode of the
 * right type (@i, looked up by the caller via walk_inode()); deletes the key
 * otherwise.  May reconstruct a missing inode when the inodes btree is known
 * to have lost data.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		/*
		 * Inode missing but the inodes btree lost data: recreate the
		 * inode, then rewind the walker and restart so it's picked up.
		 */
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n%s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1490
/*
 * After walking all extents for an inode, compare the accumulated sector
 * counts against each inode version's bi_sectors and repair mismatches.
 */
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_sectors == i->count)
			continue;

		/* re-count from the btree to double-check our accumulator: */
		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);

		if (w->recalculate_sums)
			i->count = count2;

		/* accumulator disagreeing with a fresh count is an fsck bug: */
		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
		}

		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
				trans, inode_i_sectors_wrong,
				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
				w->last_pos.inode, i->snapshot,
				i->inode.bi_sectors, i->count)) {
			i->inode.bi_sectors = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1527
check_i_sectors(struct btree_trans * trans,struct inode_walker * w)1528 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1529 {
1530 u32 restart_count = trans->restart_count;
1531 return check_i_sectors_notnested(trans, w) ?:
1532 trans_was_restarted(trans, restart_count);
1533 }
1534
/* End position of an extent in one snapshot, for overlap detection: */
struct extent_end {
	u32 snapshot;
	/* end offset of the extent */
	u64 offset;
	/* snapshots seen when this end was recorded (deep copy, owned) */
	struct snapshots_seen seen;
};

/* Extent ends seen so far for the current inode, sorted by snapshot ID: */
struct extent_ends {
	/* last extent pos processed; used to detect re-runs after a restart */
	struct bpos last_pos;
	DARRAY(struct extent_end) e;
};
1545
/* Empty the list, freeing each entry's owned snapshots_seen copy: */
static void extent_ends_reset(struct extent_ends *extent_ends)
{
	darray_for_each(extent_ends->e, i)
		snapshots_seen_exit(&i->seen);
	extent_ends->e.nr = 0;
}
1552
/* Free everything owned by @extent_ends, including the darray itself: */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1558
/* Zero-initialize; pairs with extent_ends_exit(): */
static void extent_ends_init(struct extent_ends *extent_ends)
{
	memset(extent_ends, 0, sizeof(*extent_ends));
}
1563
extent_ends_at(struct bch_fs * c,struct extent_ends * extent_ends,struct snapshots_seen * seen,struct bkey_s_c k)1564 static int extent_ends_at(struct bch_fs *c,
1565 struct extent_ends *extent_ends,
1566 struct snapshots_seen *seen,
1567 struct bkey_s_c k)
1568 {
1569 struct extent_end *i, n = (struct extent_end) {
1570 .offset = k.k->p.offset,
1571 .snapshot = k.k->p.snapshot,
1572 .seen = *seen,
1573 };
1574
1575 n.seen.ids.data = kmemdup(seen->ids.data,
1576 sizeof(seen->ids.data[0]) * seen->ids.size,
1577 GFP_KERNEL);
1578 if (!n.seen.ids.data)
1579 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1580
1581 __darray_for_each(extent_ends->e, i) {
1582 if (i->snapshot == k.k->p.snapshot) {
1583 snapshots_seen_exit(&i->seen);
1584 *i = n;
1585 return 0;
1586 }
1587
1588 if (i->snapshot >= k.k->p.snapshot)
1589 break;
1590 }
1591
1592 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1593 }
1594
/*
 * Repair a pair of overlapping extents: @pos1/@pos1_seen identify the
 * previously recorded extent (by its end position), @pos2 the key we just
 * found overlapping it.
 *
 * Both extents are re-looked-up with non-extent iterators so we see them
 * exactly as they exist in the btree, then one is overwritten by the other
 * (the one at the higher snapshot ID; the first on a tie - see the
 * "overwriting %s extent" message below).  *fixed is set on successful
 * repair.
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = {};
	struct bkey_s_c k1, k2;
	int ret;

	/* the first extent must end strictly after the second one starts: */
	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k1);

	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\nwanted\n ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(trans, &iter2, &iter1);

	/* advance the second iterator until it reaches the second extent: */
	while (1) {
		bch2_btree_iter_advance(trans, &iter2);

		k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		/* fixed message typo: was "seconding" */
		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\noverwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		/* pick which extent to overwrite: */
		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		bch_info(c, "repair ret %s", bch2_err_str(ret));

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1712
/*
 * Check extent @k against the recorded ends of previously seen extents in
 * other snapshots; overlapping_extents_found() does the repair and sets
 * *fixed (callers pass inode->recalculate_sums).
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* new inode: previously recorded ends no longer apply */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		/* only an end past our start offset can overlap: */
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		/* only matters if both extents are visible in some snapshot: */
		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1753
check_extent_overbig(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)1754 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1755 struct bkey_s_c k)
1756 {
1757 struct bch_fs *c = trans->c;
1758 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1759 struct bch_extent_crc_unpacked crc;
1760 const union bch_extent_entry *i;
1761 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1762
1763 bkey_for_each_crc(k.k, ptrs, crc, i)
1764 if (crc_is_encoded(crc) &&
1765 crc.uncompressed_size > encoded_extent_max_sectors) {
1766 struct printbuf buf = PRINTBUF;
1767
1768 bch2_bkey_val_to_text(&buf, c, k);
1769 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
1770 printbuf_exit(&buf);
1771 }
1772
1773 return 0;
1774 }
1775
/*
 * Check a single extent key: verify its snapshot, that a matching inode of
 * the right type exists, detect/repair overlapping extents, delete extents
 * past end of inode, and accumulate sector counts into @inode (verified by
 * check_i_sectors() when we move to the next inode number).
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* moved to a new inode number: verify i_sectors for the previous one */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			/* extent past i_size (rounded up to block size)? */
			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n%s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				/* delete it in this inode version's snapshot: */
				bch2_trans_copy_iter(trans, &iter2, iter);
				bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(trans, &iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	if (bkey_extent_is_allocation(k.k)) {
		/* tally sectors into every inode version this extent is visible in: */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1881
1882 /*
1883 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
 * that i_size and i_sectors are consistent
1885 */
int bch2_check_extents(struct bch_fs *c)
{
	struct inode_walker w = inode_walker_init();
	struct snapshots_seen s;
	struct extent_ends extent_ends;
	struct disk_reservation res = { 0 };

	snapshots_seen_init(&s);
	extent_ends_init(&extent_ends);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_extents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			/* drop the reservation taken by the previous iteration: */
			bch2_disk_reservation_put(c, &res);
			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
			check_extent_overbig(trans, &iter, k);
		})) ?:
		/* i_sectors for the final inode walked: */
		check_i_sectors_notnested(trans, &w));

	bch2_disk_reservation_put(c, &res);
	extent_ends_exit(&extent_ends);
	inode_walker_exit(&w);
	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
1914
/*
 * fsck pass: walk the reflink btree checking for overbig encoded extents
 * (check_extent_overbig() only reports; it makes no repairs).
 */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			/* drop the reservation taken by the previous iteration: */
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1933
/*
 * After walking all dirents for a directory, compare the accumulated
 * subdirectory counts against each inode version's bi_nlink and repair
 * mismatches.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		/* re-count from the btree to double-check our accumulator: */
		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		if (count2 < 0)
			return count2;

		/* accumulator disagreeing with a fresh count is an fsck bug: */
		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				trans, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1970
check_subdir_dirents_count(struct btree_trans * trans,struct inode_walker * w)1971 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
1972 {
1973 u32 restart_count = trans->restart_count;
1974 return check_subdir_count_notnested(trans, w) ?:
1975 trans_was_restarted(trans, restart_count);
1976 }
1977
/*
 * Find a subvolume that's a descendant of @snapshot; returns -ENOENT if none
 * exists.
 */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
			/*
			 * NOTE(review): iter is exited both here and at found:,
			 * so it's exited twice on this path - presumably
			 * bch2_trans_iter_exit() is idempotent; confirm, or
			 * drop this first call.
			 */
			bch2_trans_iter_exit(trans, &iter);
			*subvolid = k.k->p.offset;
			goto found;
		}
	}
	if (!ret)
		ret = -ENOENT;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2002
2003 noinline_for_stack
check_dirent_to_subvol(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c_dirent d)2004 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2005 struct bkey_s_c_dirent d)
2006 {
2007 struct bch_fs *c = trans->c;
2008 struct btree_iter subvol_iter = {};
2009 struct bch_inode_unpacked subvol_root;
2010 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2011 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2012 u32 parent_snapshot;
2013 u32 new_parent_subvol = 0;
2014 u64 parent_inum;
2015 struct printbuf buf = PRINTBUF;
2016 int ret = 0;
2017
2018 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2019 if (ret && !bch2_err_matches(ret, ENOENT))
2020 return ret;
2021
2022 if (ret ||
2023 (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2024 int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2025 if (ret2 && !bch2_err_matches(ret, ENOENT))
2026 return ret2;
2027 }
2028
2029 if (ret &&
2030 !new_parent_subvol &&
2031 (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2032 /*
2033 * Couldn't find a subvol for dirent's snapshot - but we lost
2034 * subvols, so we need to reconstruct:
2035 */
2036 ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2037 if (ret)
2038 return ret;
2039
2040 parent_snapshot = d.k->p.snapshot;
2041 }
2042
2043 if (fsck_err_on(ret,
2044 trans, dirent_to_missing_parent_subvol,
2045 "dirent parent_subvol points to missing subvolume\n%s",
2046 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2047 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2048 trans, dirent_not_visible_in_parent_subvol,
2049 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2050 parent_snapshot,
2051 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2052 if (!new_parent_subvol) {
2053 bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2054 return -BCH_ERR_fsck_repair_unimplemented;
2055 }
2056
2057 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2058 ret = PTR_ERR_OR_ZERO(new_dirent);
2059 if (ret)
2060 goto err;
2061
2062 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2063 }
2064
2065 struct bkey_s_c_subvolume s =
2066 bch2_bkey_get_iter_typed(trans, &subvol_iter,
2067 BTREE_ID_subvolumes, POS(0, target_subvol),
2068 0, subvolume);
2069 ret = bkey_err(s.s_c);
2070 if (ret && !bch2_err_matches(ret, ENOENT))
2071 return ret;
2072
2073 if (ret) {
2074 if (fsck_err(trans, dirent_to_missing_subvol,
2075 "dirent points to missing subvolume\n%s",
2076 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2077 return bch2_fsck_remove_dirent(trans, d.k->p);
2078 ret = 0;
2079 goto out;
2080 }
2081
2082 if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2083 trans, subvol_fs_path_parent_wrong,
2084 "subvol with wrong fs_path_parent, should be be %u\n%s",
2085 parent_subvol,
2086 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2087 struct bkey_i_subvolume *n =
2088 bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2089 ret = PTR_ERR_OR_ZERO(n);
2090 if (ret)
2091 goto err;
2092
2093 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2094 }
2095
2096 u64 target_inum = le64_to_cpu(s.v->inode);
2097 u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2098
2099 ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
2100 if (ret && !bch2_err_matches(ret, ENOENT))
2101 goto err;
2102
2103 if (ret) {
2104 bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2105 ret = -BCH_ERR_fsck_repair_unimplemented;
2106 goto err;
2107 }
2108
2109 if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2110 trans, inode_bi_parent_wrong,
2111 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2112 target_inum,
2113 subvol_root.bi_parent_subvol, parent_subvol)) {
2114 subvol_root.bi_parent_subvol = parent_subvol;
2115 subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2116 ret = __bch2_fsck_write_inode(trans, &subvol_root);
2117 if (ret)
2118 goto err;
2119 }
2120
2121 ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
2122 if (ret)
2123 goto err;
2124 out:
2125 err:
2126 fsck_err:
2127 bch2_trans_iter_exit(trans, &subvol_iter);
2128 printbuf_exit(&buf);
2129 return ret;
2130 }
2131
check_dirent(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k,struct bch_hash_info * hash_info,struct inode_walker * dir,struct inode_walker * target,struct snapshots_seen * s)2132 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
2133 struct bkey_s_c k,
2134 struct bch_hash_info *hash_info,
2135 struct inode_walker *dir,
2136 struct inode_walker *target,
2137 struct snapshots_seen *s)
2138 {
2139 struct bch_fs *c = trans->c;
2140 struct inode_walker_entry *i;
2141 struct printbuf buf = PRINTBUF;
2142 int ret = 0;
2143
2144 ret = bch2_check_key_has_snapshot(trans, iter, k);
2145 if (ret) {
2146 ret = ret < 0 ? ret : 0;
2147 goto out;
2148 }
2149
2150 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
2151 if (ret)
2152 goto err;
2153
2154 if (k.k->type == KEY_TYPE_whiteout)
2155 goto out;
2156
2157 if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
2158 ret = check_subdir_dirents_count(trans, dir);
2159 if (ret)
2160 goto err;
2161 }
2162
2163 i = walk_inode(trans, dir, k);
2164 ret = PTR_ERR_OR_ZERO(i);
2165 if (ret < 0)
2166 goto err;
2167
2168 ret = check_key_has_inode(trans, iter, dir, i, k);
2169 if (ret)
2170 goto err;
2171
2172 if (!i)
2173 goto out;
2174
2175 if (dir->first_this_inode)
2176 *hash_info = bch2_hash_info_init(c, &i->inode);
2177 dir->first_this_inode = false;
2178
2179 ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
2180 if (ret < 0)
2181 goto err;
2182 if (ret) {
2183 /* dirent has been deleted */
2184 ret = 0;
2185 goto out;
2186 }
2187
2188 if (k.k->type != KEY_TYPE_dirent)
2189 goto out;
2190
2191 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2192
2193 /* check casefold */
2194 if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
2195 trans, dirent_casefold_mismatch,
2196 "dirent casefold does not match dir casefold\n%s",
2197 (printbuf_reset(&buf),
2198 bch2_bkey_val_to_text(&buf, c, k),
2199 buf.buf))) {
2200 struct qstr name = bch2_dirent_get_name(d);
2201 u32 subvol = d.v->d_type == DT_SUBVOL
2202 ? d.v->d_parent_subvol
2203 : 0;
2204 u64 target = d.v->d_type == DT_SUBVOL
2205 ? d.v->d_child_subvol
2206 : d.v->d_inum;
2207 u64 dir_offset;
2208
2209 ret = bch2_hash_delete_at(trans,
2210 bch2_dirent_hash_desc, hash_info, iter,
2211 BTREE_UPDATE_internal_snapshot_node) ?:
2212 bch2_dirent_create_snapshot(trans, subvol,
2213 d.k->p.inode, d.k->p.snapshot,
2214 hash_info,
2215 d.v->d_type,
2216 &name,
2217 target,
2218 &dir_offset,
2219 BTREE_ITER_with_updates|
2220 BTREE_UPDATE_internal_snapshot_node|
2221 STR_HASH_must_create) ?:
2222 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
2223
2224 /* might need another check_dirents pass */
2225 goto out;
2226 }
2227
2228 if (d.v->d_type == DT_SUBVOL) {
2229 ret = check_dirent_to_subvol(trans, iter, d);
2230 if (ret)
2231 goto err;
2232 } else {
2233 ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
2234 if (ret)
2235 goto err;
2236
2237 if (fsck_err_on(!target->inodes.nr,
2238 trans, dirent_to_missing_inode,
2239 "dirent points to missing inode:\n%s",
2240 (printbuf_reset(&buf),
2241 bch2_bkey_val_to_text(&buf, c, k),
2242 buf.buf))) {
2243 ret = bch2_fsck_remove_dirent(trans, d.k->p);
2244 if (ret)
2245 goto err;
2246 }
2247
2248 darray_for_each(target->inodes, i) {
2249 ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
2250 if (ret)
2251 goto err;
2252 }
2253
2254 darray_for_each(target->deletes, i)
2255 if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
2256 trans, dirent_to_overwritten_inode,
2257 "dirent points to inode overwritten in snapshot %u:\n%s",
2258 *i,
2259 (printbuf_reset(&buf),
2260 bch2_bkey_val_to_text(&buf, c, k),
2261 buf.buf))) {
2262 struct btree_iter delete_iter;
2263 bch2_trans_iter_init(trans, &delete_iter,
2264 BTREE_ID_dirents,
2265 SPOS(k.k->p.inode, k.k->p.offset, *i),
2266 BTREE_ITER_intent);
2267 ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
2268 bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
2269 hash_info,
2270 &delete_iter,
2271 BTREE_UPDATE_internal_snapshot_node);
2272 bch2_trans_iter_exit(trans, &delete_iter);
2273 if (ret)
2274 goto err;
2275
2276 }
2277 }
2278
2279 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
2280 if (ret)
2281 goto err;
2282
2283 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
2284 if (d.v->d_type == DT_DIR)
2285 i->count++;
2286 i->i_size += bkey_bytes(d.k);
2287 }
2288 out:
2289 err:
2290 fsck_err:
2291 printbuf_exit(&buf);
2292 bch_err_fn(c, ret);
2293 return ret;
2294 }
2295
/*
 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
 * validate d_type
 */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;

	snapshots_seen_init(&s);

	/*
	 * The trailing check_subdir_count_notnested() call handles the final
	 * directory: check_dirent() only runs the count check when it sees a
	 * transition to a new directory inode.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents,
				   POS(BCACHEFS_ROOT_INO, 0),
				   BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2322
/*
 * Check one key in the xattrs btree: valid snapshot, an owning inode exists
 * in this snapshot, and the key sits at the correct hash position for that
 * inode's hash seed/type.
 */
static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_hash_info *hash_info,
		       struct inode_walker *inode)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		return ret;
	/* a positive return is not an error: key was dealt with */
	if (ret)
		return 0;

	i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	ret = check_key_has_inode(trans, iter, inode, i, k);
	if (ret)
		return ret;

	/* no inode owns this xattr in this snapshot */
	if (!i)
		return 0;

	/* hash seed/type come from the owning inode; recompute per inode */
	if (inode->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	inode->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
	bch_err_fn(c, ret);
	return ret;
}
2358
/*
 * Walk xattrs: verify that they all have a corresponding inode
 */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}
2381
check_root_trans(struct btree_trans * trans)2382 static int check_root_trans(struct btree_trans *trans)
2383 {
2384 struct bch_fs *c = trans->c;
2385 struct bch_inode_unpacked root_inode;
2386 u32 snapshot;
2387 u64 inum;
2388 int ret;
2389
2390 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2391 if (ret && !bch2_err_matches(ret, ENOENT))
2392 return ret;
2393
2394 if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
2395 "root subvol missing")) {
2396 struct bkey_i_subvolume *root_subvol =
2397 bch2_trans_kmalloc(trans, sizeof(*root_subvol));
2398 ret = PTR_ERR_OR_ZERO(root_subvol);
2399 if (ret)
2400 goto err;
2401
2402 snapshot = U32_MAX;
2403 inum = BCACHEFS_ROOT_INO;
2404
2405 bkey_subvolume_init(&root_subvol->k_i);
2406 root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
2407 root_subvol->v.flags = 0;
2408 root_subvol->v.snapshot = cpu_to_le32(snapshot);
2409 root_subvol->v.inode = cpu_to_le64(inum);
2410 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
2411 bch_err_msg(c, ret, "writing root subvol");
2412 if (ret)
2413 goto err;
2414 }
2415
2416 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
2417 if (ret && !bch2_err_matches(ret, ENOENT))
2418 return ret;
2419
2420 if (mustfix_fsck_err_on(ret,
2421 trans, root_dir_missing,
2422 "root directory missing") ||
2423 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2424 trans, root_inode_not_dir,
2425 "root inode not a directory")) {
2426 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2427 0, NULL);
2428 root_inode.bi_inum = inum;
2429 root_inode.bi_snapshot = snapshot;
2430
2431 ret = __bch2_fsck_write_inode(trans, &root_inode);
2432 bch_err_msg(c, ret, "writing root inode");
2433 }
2434 err:
2435 fsck_err:
2436 return ret;
2437 }
2438
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
	/* single transaction, retried+committed by bch2_trans_commit_do() */
	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_root_trans(trans));
	bch_err_fn(c, ret);
	return ret;
}
2447
2448 typedef DARRAY(u32) darray_u32;
2449
darray_u32_has(darray_u32 * d,u32 v)2450 static bool darray_u32_has(darray_u32 *d, u32 v)
2451 {
2452 darray_for_each(*d, i)
2453 if (*i == v)
2454 return true;
2455 return false;
2456 }
2457
/*
 * Verify that the subvolume @k can reach the root subvolume by following
 * fs_path_parent links; detect loops (parent already on the path walked so
 * far) and unreachable subvolumes, reattaching them if the fsck error is
 * confirmed.
 */
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		/* record visited subvol IDs for loop detection */
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		if (darray_u32_has(&subvol_path, parent)) {
			if (fsck_err(trans, subvol_loop, "subvolume loop"))
				ret = reattach_subvol(trans, s);
			break;
		}

		/* step up to the parent subvolume */
		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(trans, &parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		/* parent slot empty or not a subvolume: @s is unreachable */
		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}
2515
/* Walk all subvolumes, checking each one's path up to the root subvolume. */
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2526
/*
 * One step on a path from an inode up towards its subvolume root; used by
 * check_path_loop() for loop detection and bi_depth renumbering.
 */
struct pathbuf_entry {
	u64 inum;
	u32 snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;
2533
/*
 * Set one inode's bi_depth to @new_depth, writing and committing immediately
 * (no-op if it already matches).
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, p->inum, p->snapshot), 0);

	struct bch_inode_unpacked inode;
	int ret = bkey_err(k) ?:
		!bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode
		: bch2_inode_unpack(k, &inode);
	if (ret)
		goto err;

	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2557
/*
 * Renumber bi_depth for every inode on @path, walking from the last-pushed
 * (deepest ancestor) entry back down, assigning @new_bi_depth and
 * incrementing by one per level.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	/* report restarts so the caller knows its iterators may be invalid */
	return ret ?: trans_was_restarted(trans, restart_count);
}
2575
path_is_dup(pathbuf * p,u64 inum,u32 snapshot)2576 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2577 {
2578 darray_for_each(*p, i)
2579 if (i->inum == inum &&
2580 i->snapshot == snapshot)
2581 return true;
2582 return false;
2583 }
2584
check_path_loop(struct btree_trans * trans,struct bkey_s_c inode_k)2585 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2586 {
2587 struct bch_fs *c = trans->c;
2588 struct btree_iter inode_iter = {};
2589 pathbuf path = {};
2590 struct printbuf buf = PRINTBUF;
2591 u32 snapshot = inode_k.k->p.snapshot;
2592 bool redo_bi_depth = false;
2593 u32 min_bi_depth = U32_MAX;
2594 int ret = 0;
2595
2596 struct bch_inode_unpacked inode;
2597 ret = bch2_inode_unpack(inode_k, &inode);
2598 if (ret)
2599 return ret;
2600
2601 while (!inode.bi_subvol) {
2602 struct btree_iter dirent_iter;
2603 struct bkey_s_c_dirent d;
2604 u32 parent_snapshot = snapshot;
2605
2606 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2607 ret = bkey_err(d.s_c);
2608 if (ret && !bch2_err_matches(ret, ENOENT))
2609 goto out;
2610
2611 if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2612 bch2_trans_iter_exit(trans, &dirent_iter);
2613
2614 if (bch2_err_matches(ret, ENOENT)) {
2615 printbuf_reset(&buf);
2616 bch2_bkey_val_to_text(&buf, c, inode_k);
2617 bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2618 bch2_err_str(ret), buf.buf);
2619 goto out;
2620 }
2621
2622 bch2_trans_iter_exit(trans, &dirent_iter);
2623
2624 ret = darray_push(&path, ((struct pathbuf_entry) {
2625 .inum = inode.bi_inum,
2626 .snapshot = snapshot,
2627 }));
2628 if (ret)
2629 return ret;
2630
2631 snapshot = parent_snapshot;
2632
2633 bch2_trans_iter_exit(trans, &inode_iter);
2634 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2635 SPOS(0, inode.bi_dir, snapshot), 0);
2636
2637 struct bch_inode_unpacked parent_inode;
2638 ret = bkey_err(inode_k) ?:
2639 !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2640 : bch2_inode_unpack(inode_k, &parent_inode);
2641 if (ret) {
2642 /* Should have been caught in dirents pass */
2643 bch_err_msg(c, ret, "error looking up parent directory");
2644 goto out;
2645 }
2646
2647 min_bi_depth = parent_inode.bi_depth;
2648
2649 if (parent_inode.bi_depth < inode.bi_depth &&
2650 min_bi_depth < U16_MAX)
2651 break;
2652
2653 inode = parent_inode;
2654 snapshot = inode_k.k->p.snapshot;
2655 redo_bi_depth = true;
2656
2657 if (path_is_dup(&path, inode.bi_inum, snapshot)) {
2658 /* XXX print path */
2659 bch_err(c, "directory structure loop");
2660
2661 darray_for_each(path, i)
2662 pr_err("%llu:%u", i->inum, i->snapshot);
2663 pr_err("%llu:%u", inode.bi_inum, snapshot);
2664
2665 if (fsck_err(trans, dir_loop, "directory structure loop")) {
2666 ret = remove_backpointer(trans, &inode);
2667 bch_err_msg(c, ret, "removing dirent");
2668 if (ret)
2669 break;
2670
2671 ret = reattach_inode(trans, &inode);
2672 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2673 }
2674
2675 goto out;
2676 }
2677 }
2678
2679 if (inode.bi_subvol)
2680 min_bi_depth = 0;
2681
2682 if (redo_bi_depth)
2683 ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
2684 out:
2685 fsck_err:
2686 bch2_trans_iter_exit(trans, &inode_iter);
2687 darray_exit(&path);
2688 printbuf_exit(&buf);
2689 bch_err_fn(c, ret);
2690 return ret;
2691 }
2692
/*
 * Check for loops in the directory structure: all other connectivity issues
 * have been fixed by prior passes
 */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			/* only directories can participate in a loop */
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			/* skip inodes that are pending deletion */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2717
/*
 * Table of inodes that may be hardlinked (non-directories with nonzero
 * bi_nlink), built in btree-key order (so sorted by inum) by
 * check_nlinks_find_hardlinks(); the dirent walk bumps each entry's count,
 * which is then compared against the inode's bi_nlink.
 */
struct nlink_table {
	size_t nr;	/* entries in use */
	size_t size;	/* entries allocated */

	struct nlink {
		u64 inum;
		u32 snapshot;
		u32 count;	/* dirent references observed */
	} *d;
};
2728
/*
 * Append a (inum, snapshot) entry to the hardlink table, growing the backing
 * array (doubling, minimum 128 entries) when full.  The new entry's count
 * starts at zero via the designated initializer.
 */
static int add_nlink(struct bch_fs *c, struct nlink_table *t,
		     u64 inum, u32 snapshot)
{
	if (t->nr == t->size) {
		size_t new_size = max_t(size_t, 128UL, t->size * 2);
		void *new_d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);

		if (!new_d) {
			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
				new_size);
			return -BCH_ERR_ENOMEM_fsck_add_nlink;
		}

		/* carry over existing entries, then release the old array */
		if (t->d)
			memcpy(new_d, t->d, t->size * sizeof(t->d[0]));
		kvfree(t->d);

		t->d = new_d;
		t->size = new_size;
	}

	t->d[t->nr++] = (struct nlink) {
		.inum = inum,
		.snapshot = snapshot,
	};

	return 0;
}
2758
nlink_cmp(const void * _l,const void * _r)2759 static int nlink_cmp(const void *_l, const void *_r)
2760 {
2761 const struct nlink *l = _l;
2762 const struct nlink *r = _r;
2763
2764 return cmp_int(l->inum, r->inum);
2765 }
2766
/*
 * Count one dirent reference to inode @inum, seen in snapshot @snapshot.
 * @links->d is sorted by inum (entries were added in btree-key order), so we
 * binary search for some entry with this inum, rewind to the first one, then
 * walk forward over the snapshot variants bumping every entry the reference
 * is visible in.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	/* only count inodes in the range currently loaded into the table */
	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* bsearch may land anywhere in a run of equal inums: rewind */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2793
/*
 * nlinks pass 1: collect into @t every inode that may be hardlinked
 * (non-directory with nonzero bi_nlink), starting at inode number @start.
 * If the table fills up (add_nlink() fails with ENOMEM), record where to
 * resume in *@end and return success - bch2_check_nlinks() runs another
 * round for the remainder.
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* table full: remember resume point, not an error */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2840
/*
 * nlinks pass 2: walk all dirents, incrementing the table entry for each
 * referenced inode in [range_start, range_end).  Directory and subvolume
 * dirents are skipped - those targets can't be hardlinked.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				/* dirs/subvols can't be hardlinked */
				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
2874
/*
 * nlinks pass 3 helper: compare one inode's bi_nlink against the dirent
 * count collected in @links, rewriting the inode if they disagree.
 * Returns 1 (stop iterating) once past @range_end.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* table only holds non-directories with nonzero bi_nlink */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* advance to this inode's entry: table and iteration share ordering */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
2917
/*
 * nlinks pass 3: re-walk inodes in [range_start, range_end), fixing any
 * whose bi_nlink disagrees with the counts collected in @links.  A positive
 * return from check_nlinks_update_inode() just means "past range_end" and
 * is deliberately not treated as an error.
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	return 0;
}
2938
bch2_check_nlinks(struct bch_fs * c)2939 int bch2_check_nlinks(struct bch_fs *c)
2940 {
2941 struct nlink_table links = { 0 };
2942 u64 this_iter_range_start, next_iter_range_start = 0;
2943 int ret = 0;
2944
2945 do {
2946 this_iter_range_start = next_iter_range_start;
2947 next_iter_range_start = U64_MAX;
2948
2949 ret = check_nlinks_find_hardlinks(c, &links,
2950 this_iter_range_start,
2951 &next_iter_range_start);
2952
2953 ret = check_nlinks_walk_dirents(c, &links,
2954 this_iter_range_start,
2955 next_iter_range_start);
2956 if (ret)
2957 break;
2958
2959 ret = check_nlinks_update_hardlinks(c, &links,
2960 this_iter_range_start,
2961 next_iter_range_start);
2962 if (ret)
2963 break;
2964
2965 links.nr = 0;
2966 } while (next_iter_range_start != U64_MAX);
2967
2968 kvfree(links.d);
2969 bch_err_fn(c, ret);
2970 return ret;
2971 }
2972
fix_reflink_p_key(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)2973 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2974 struct bkey_s_c k)
2975 {
2976 struct bkey_s_c_reflink_p p;
2977 struct bkey_i_reflink_p *u;
2978
2979 if (k.k->type != KEY_TYPE_reflink_p)
2980 return 0;
2981
2982 p = bkey_s_c_to_reflink_p(k);
2983
2984 if (!p.v->front_pad && !p.v->back_pad)
2985 return 0;
2986
2987 u = bch2_trans_kmalloc(trans, sizeof(*u));
2988 int ret = PTR_ERR_OR_ZERO(u);
2989 if (ret)
2990 return ret;
2991
2992 bkey_reassemble(&u->k_i, k);
2993 u->v.front_pad = 0;
2994 u->v.back_pad = 0;
2995
2996 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2997 }
2998
/*
 * Clear stale front_pad/back_pad in all reflink pointers; no-op on
 * filesystems already at (or past) metadata version reflink_p_fix.
 */
int bch2_fix_reflink_p(struct bch_fs *c)
{
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
3014
3015 #ifndef NO_BCACHEFS_CHARDEV
3016
/* Per-invocation state for an offline/online fsck worker thread. */
struct fsck_thread {
	struct thread_with_stdio thr;	/* stdio redirection to the ioctl caller */
	struct bch_fs *c;
	struct bch_opts opts;
};
3022
/* Teardown callback: free the fsck_thread embedding @_thr. */
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
	kfree(thr);
}
3028
/*
 * Offline fsck worker: start the (opened with nostart) filesystem, then
 * report results via fsck(8)-style exit code bits: 1 = errors were fixed,
 * 4 = errors remain.
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* bch2_fs_open() may have returned an ERR_PTR */
	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}
3054
/* thread_with_stdio callbacks for offline fsck */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit = bch2_fsck_thread_exit,
	.fn = bch2_fsck_offline_thread_fn,
};
3059
/*
 * BCH_IOCTL_FSCK_OFFLINE: fsck an unmounted filesystem.  Copies the device
 * paths and mount options in from userspace, opens the filesystem read-only
 * with stdio redirected to the caller, then runs fsck in a worker thread.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_str(devs) = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* devs[] is an array of user pointers to path strings */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret = PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);

	/* don't take the whole machine down on fsck errors */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3140
/*
 * Kthread body for online fsck (BCH_IOCTL_FSCK_ONLINE): runs the online
 * recovery passes on a mounted filesystem, with fs log output and fix-error
 * prompts redirected to the ioctl caller's stdio.
 *
 * Releases c->online_fsck_mutex and the ro ref taken by
 * bch2_ioctl_fsck_online() before returning.
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* Route fs messages from this thread to the caller's stdio: */
	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		/* no explicit fix_errors option: prompt interactively */
		c->opts.fix_errors = FSCK_FIX_ask;

	c->opts.fsck = true;
	set_bit(BCH_FS_fsck_running, &c->flags);

	/*
	 * NOTE(review): online fsck starts at check_alloc_info — earlier
	 * passes appear to require an offline fs; confirm.
	 */
	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
	int ret = bch2_run_online_recovery_passes(c);

	clear_bit(BCH_FS_fsck_running, &c->flags);
	bch_err_fn(c, ret);

	/* Undo the stdio redirection and opts changes: */
	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* Drop the resources the ioctl handler acquired on our behalf: */
	up(&c->online_fsck_mutex);
	bch2_ro_ref_put(c);
	return ret;
}
3175
3176 static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
3177 .exit = bch2_fsck_thread_exit,
3178 .fn = bch2_fsck_online_thread_fn,
3179 };
3180
/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem.
 *
 * Takes a ro ref on @c and the online_fsck_mutex (only one online fsck may
 * run at a time), then spawns bch2_fsck_online_thread_fn() in a kthread
 * whose stdio is redirected to the caller.
 *
 * On success, ownership of the ro ref and the mutex passes to the kthread,
 * which releases them when it finishes; on failure they are released here.
 *
 * Returns the value of bch2_run_thread_with_stdio() on success (presumably a
 * file descriptor for the stdio channel — TODO confirm), or a negative error
 * code.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	/* No flags are currently defined for this ioctl: */
	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Keep the fs from going away while fsck runs: */
	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* Only one online fsck at a time: */
	if (down_trylock(&c->online_fsck_mutex)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		/* 64k cap on the option string length: */
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret = PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		/* thread never started, so release what we acquired above: */
		up(&c->online_fsck_mutex);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3232
3233 #endif /* NO_BCACHEFS_CHARDEV */
3234