1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bcachefs_ioctl.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_update.h"
8 #include "buckets.h"
9 #include "darray.h"
10 #include "dirent.h"
11 #include "error.h"
12 #include "fs.h"
13 #include "fsck.h"
14 #include "inode.h"
15 #include "keylist.h"
16 #include "namei.h"
17 #include "recovery_passes.h"
18 #include "snapshot.h"
19 #include "super.h"
20 #include "thread_with_file.h"
21 #include "xattr.h"
22 
23 #include <linux/bsearch.h>
24 #include <linux/dcache.h> /* struct qstr */
25 
dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,struct bch_inode_unpacked * inode)26 static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
27 					 struct bch_inode_unpacked *inode)
28 {
29 	if (d.v->d_type == DT_SUBVOL
30 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
31 	    : le64_to_cpu(d.v->d_inum)		== inode->bi_inum)
32 		return 0;
33 	return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
34 }
35 
/*
 * Format a "dirent doesn't point back at inode" report into @out:
 * one line of explanation, then the dirent key, then the unpacked inode.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
				      struct bch_fs *c,
				      struct bkey_s_c_dirent dirent,
				      struct bch_inode_unpacked *inode)
{
	prt_str(out, "inode points to dirent that does not point back:");
	prt_newline(out);
	bch2_bkey_val_to_text(out, c, dirent.s_c);
	prt_newline(out);
	bch2_inode_unpacked_to_text(out, inode);
}
47 
dirent_points_to_inode(struct bch_fs * c,struct bkey_s_c_dirent dirent,struct bch_inode_unpacked * inode)48 static int dirent_points_to_inode(struct bch_fs *c,
49 				  struct bkey_s_c_dirent dirent,
50 				  struct bch_inode_unpacked *inode)
51 {
52 	int ret = dirent_points_to_inode_nowarn(dirent, inode);
53 	if (ret) {
54 		struct printbuf buf = PRINTBUF;
55 		dirent_inode_mismatch_msg(&buf, c, dirent, inode);
56 		bch_warn(c, "%s", buf.buf);
57 		printbuf_exit(&buf);
58 	}
59 	return ret;
60 }
61 
62 /*
63  * XXX: this is handling transaction restarts without returning
64  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
65  */
/*
 * Count the sectors of all allocating extents of inode @inum in @snapshot.
 *
 * Returns the sector total on success, or a negative error code (the total
 * is returned through the s64 return value, so callers must check < 0).
 */
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 sectors = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
				SPOS(inum, 0, snapshot),
				POS(inum, U64_MAX),
				0, k, ({
		if (bkey_extent_is_allocation(k.k))
			sectors += k.k->size;
		0;
	}));

	return ret ?: sectors;
}
82 
/*
 * Count the subdirectory dirents of directory @inum in @snapshot.
 *
 * Returns the count on success, or a negative error code.
 */
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
				    u32 snapshot)
{
	u64 subdirs = 0;

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents,
				    SPOS(inum, 0, snapshot),
				    POS(inum, U64_MAX),
				    0, k, ({
		if (k.k->type == KEY_TYPE_dirent &&
		    bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
			subdirs++;
		0;
	}));

	return ret ?: subdirs;
}
100 
/*
 * Look up subvolume @subvol, returning its snapshot ID and root inode number.
 *
 * Only writes *snapshot and *inum on success: previously they were assigned
 * unconditionally, copying from @s even when bch2_subvolume_get() failed —
 * it's not clear bch2_subvolume_get() initializes @s on error, so that could
 * read uninitialized stack memory.  Callers only consume the outputs when the
 * return value is 0, so this is backward-compatible.
 */
static int subvol_lookup(struct btree_trans *trans, u32 subvol,
			 u32 *snapshot, u64 *inum)
{
	struct bch_subvolume s;
	int ret = bch2_subvolume_get(trans, subvol, false, &s);
	if (ret)
		return ret;

	*snapshot = le32_to_cpu(s.snapshot);
	*inum = le64_to_cpu(s.inode);
	return 0;
}
111 
/*
 * Fetch and unpack the inode @inode_nr:@snapshot.
 *
 * Returns 0 and fills *inode on success, -BCH_ERR_ENOENT_inode if no inode
 * exists at that position, or a btree iterator error.
 */
static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, inode_nr, snapshot), 0);
	int ret = bkey_err(k);
	if (!ret)
		ret = bkey_is_inode(k.k)
			? bch2_inode_unpack(k, inode)
			: -BCH_ERR_ENOENT_inode;

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
132 
/*
 * Look up dirent @name in directory @dir, constrained to @snapshot.
 *
 * On success, sets *target to the dirent's d_inum and *type to its d_type.
 * NOTE(review): d_inum is read unconditionally, even for DT_SUBVOL dirents
 * where the target is really d_child_subvol — callers are expected to handle
 * that; confirm against call sites.
 */
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
			   struct bch_hash_info hash_info,
			   subvol_inum dir, struct qstr *name,
			   u64 *target, unsigned *type, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
							 &hash_info, dir, name, 0, snapshot);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	*target = le64_to_cpu(d.v->d_inum);
	*type = d.v->d_type;
	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
151 
152 /*
153  * Find any subvolume associated with a tree of snapshots
154  * We can't rely on master_subvol - it might have been deleted.
155  */
/*
 * Scan the snapshots btree for any snapshot in @tree_id that still has an
 * associated subvolume; return its ID in *subvol.
 *
 * We can't rely on master_subvol - it might have been deleted.
 */
static int find_snapshot_tree_subvol(struct btree_trans *trans,
				     u32 tree_id, u32 *subvol)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_snapshot)
			continue;

		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
		if (le32_to_cpu(s.v->tree) != tree_id)
			continue;

		if (s.v->subvol) {
			*subvol = le32_to_cpu(s.v->subvol);
			goto found;
		}
	}
	/*
	 * Don't clobber a real iteration error (e.g. a transaction restart)
	 * with ENOENT: previously ret was overwritten unconditionally here,
	 * which would swallow -BCH_ERR_transaction_restart errors.
	 */
	if (!ret)
		ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
181 
182 /* Get lost+found, create if it doesn't exist: */
static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
			    struct bch_inode_unpacked *lostfound,
			    u64 reattaching_inum)
{
	struct bch_fs *c = trans->c;
	struct qstr lostfound_str = QSTR("lost+found");
	struct btree_iter lostfound_iter = {};
	u64 inum = 0;
	unsigned d_type = 0;
	int ret;

	/* Find the snapshot tree @snapshot belongs to: */
	struct bch_snapshot_tree st;
	ret = bch2_snapshot_tree_lookup(trans,
			bch2_snapshot_tree(c, snapshot), &st);
	if (ret)
		return ret;

	/*
	 * Find any live subvolume in that tree - the master subvol may have
	 * been deleted (see find_snapshot_tree_subvol()):
	 */
	u32 subvolid;
	ret = find_snapshot_tree_subvol(trans,
				bch2_snapshot_tree(c, snapshot), &subvolid);
	bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u",
		    bch2_snapshot_tree(c, snapshot));
	if (ret)
		return ret;

	struct bch_subvolume subvol;
	ret = bch2_subvolume_get(trans, subvolid, false, &subvol);
	bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot);
	if (ret)
		return ret;

	/*
	 * The subvolume's root inode field may be unset; repair it to point at
	 * the inode we're reattaching (shadowing the outer @subvol struct with
	 * a mutable key here):
	 */
	if (!subvol.inode) {
		struct btree_iter iter;
		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_subvolumes, POS(0, subvolid),
				0, subvolume);
		ret = PTR_ERR_OR_ZERO(subvol);
		if (ret)
			return ret;

		subvol->v.inode = cpu_to_le64(reattaching_inum);
		bch2_trans_iter_exit(trans, &iter);
	}

	subvol_inum root_inum = {
		.subvol = subvolid,
		.inum = le64_to_cpu(subvol.inode)
	};

	/* Look up the subvolume root so we can hash/search its dirents: */
	struct bch_inode_unpacked root_inode;
	struct bch_hash_info root_hash_info;
	ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
		    root_inum.inum, subvolid);
	if (ret)
		return ret;

	root_hash_info = bch2_hash_info_init(c, &root_inode);

	ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
			      &lostfound_str, &inum, &d_type, snapshot);
	if (bch2_err_matches(ret, ENOENT))
		goto create_lostfound;

	bch_err_fn(c, ret);
	if (ret)
		return ret;

	if (d_type != DT_DIR) {
		bch_err(c, "error looking up lost+found: not a directory");
		return -BCH_ERR_ENOENT_not_directory;
	}

	/*
	 * The bch2_check_dirents pass has already run, dangling dirents
	 * shouldn't exist here:
	 */
	ret = lookup_inode(trans, inum, snapshot, lostfound);
	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
	return ret;

create_lostfound:
	/*
	 * we always create lost+found in the root snapshot; we don't want
	 * different branches of the snapshot tree to have different lost+found
	 */
	snapshot = le32_to_cpu(st.root_snapshot);
	/*
	 * XXX: we could have a nicer log message here  if we had a nice way to
	 * walk backpointers to print a path
	 */
	struct printbuf path = PRINTBUF;
	ret = bch2_inum_to_path(trans, root_inum, &path);
	if (ret)
		goto err;

	bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u",
		   path.buf, root_inum.subvol, snapshot);
	printbuf_exit(&path);

	u64 now = bch2_current_time(c);
	u64 cpu = raw_smp_processor_id();

	/* New directory inode, mode 0700, parented to the subvol root: */
	bch2_inode_init_early(c, lostfound);
	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
	lostfound->bi_dir = root_inode.bi_inum;
	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);

	/* Account for the new ".." link in the root directory: */
	root_inode.bi_nlink++;

	ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
	if (ret)
		goto err;

	bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
	ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
	if (ret)
		goto err;

	/* Create the dirent in the root dir, then write the new inode: */
	ret =   bch2_dirent_create_snapshot(trans,
				0, root_inode.bi_inum, snapshot, &root_hash_info,
				mode_to_type(lostfound->bi_mode),
				&lostfound_str,
				lostfound->bi_inum,
				&lostfound->bi_dir_offset,
				BTREE_UPDATE_internal_snapshot_node|
				STR_HASH_must_create) ?:
		bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
				       BTREE_UPDATE_internal_snapshot_node);
err:
	bch_err_msg(c, ret, "creating lost+found");
	bch2_trans_iter_exit(trans, &lostfound_iter);
	return ret;
}
318 
inode_should_reattach(struct bch_inode_unpacked * inode)319 static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
320 {
321 	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
322 	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
323 		return false;
324 
325 	/*
326 	 * Subvolume roots are special: older versions of subvolume roots may be
327 	 * disconnected, it's only the newest version that matters.
328 	 *
329 	 * We only keep a single dirent pointing to a subvolume root, i.e.
330 	 * older versions of snapshots will not have a different dirent pointing
331 	 * to the same subvolume root.
332 	 *
333 	 * This is because dirents that point to subvolumes are only visible in
334 	 * the parent subvolume - versioning is not needed - and keeping them
335 	 * around would break fsck, because when we're crossing subvolumes we
336 	 * don't have a consistent snapshot ID to do check the inode <-> dirent
337 	 * relationships.
338 	 *
339 	 * Thus, a subvolume root that's been renamed after a snapshot will have
340 	 * a disconnected older version - that's expected.
341 	 *
342 	 * Note that taking a snapshot always updates the root inode (to update
343 	 * the dirent backpointer), so a subvolume root inode with
344 	 * BCH_INODE_has_child_snapshot is never visible.
345 	 */
346 	if (inode->bi_subvol &&
347 	    (inode->bi_flags & BCH_INODE_has_child_snapshot))
348 		return false;
349 
350 	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
351 }
352 
/*
 * If a dirent exists at @d_pos in @snapshot (exactly - not visible via an
 * ancestor), overwrite it with a whiteout so it is no longer visible in that
 * snapshot.
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
					SPOS(d_pos.inode, d_pos.offset, snapshot),
					BTREE_ITER_intent|
					BTREE_ITER_with_updates);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	if (bpos_eq(k.k->p, d_pos)) {
		/*
		 * bch2_btree_delete_at() doesn't work because the update path
		 * doesn't internally use BTREE_ITER_with_updates yet, so build
		 * the whiteout key by hand:
		 */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
		ret = PTR_ERR_OR_ZERO(k);
		if (ret)
			goto err;

		bkey_init(&k->k);
		k->k.type = KEY_TYPE_whiteout;
		k->k.p = iter.pos;
		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
383 
/*
 * Reattach a disconnected inode: create a dirent for it in lost+found
 * (named "subvol-<id>" for subvolume roots, "<inum>" otherwise), update the
 * inode's backpointer fields, and fix up versions of the inode in child
 * snapshots.
 */
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked lostfound;
	char name_buf[20];
	int ret;

	u32 dirent_snapshot = inode->bi_snapshot;
	if (inode->bi_subvol) {
		/* Subvolume roots get reattached under the root subvolume: */
		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;

		u64 root_inum;
		ret = subvol_lookup(trans, inode->bi_parent_subvol,
				    &dirent_snapshot, &root_inum);
		if (ret)
			return ret;

		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
	} else {
		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
	}

	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
	if (ret)
		return ret;

	/* Directories add a ".." link to lost+found: */
	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);

	/* ensure lost+found inode is also present in inode snapshot */
	if (!inode->bi_subvol) {
		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
		lostfound.bi_snapshot = inode->bi_snapshot;
	}

	ret = __bch2_fsck_write_inode(trans, &lostfound);
	if (ret)
		return ret;

	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
	struct qstr name = QSTR(name_buf);

	inode->bi_dir = lostfound.bi_inum;

	ret = bch2_dirent_create_snapshot(trans,
				inode->bi_parent_subvol, lostfound.bi_inum,
				dirent_snapshot,
				&dir_hash,
				inode_d_type(inode),
				&name,
				inode->bi_subvol ?: inode->bi_inum,
				&inode->bi_dir_offset,
				BTREE_UPDATE_internal_snapshot_node|
				STR_HASH_must_create);
	if (ret) {
		bch_err_msg(c, ret, "error creating dirent");
		return ret;
	}

	ret = __bch2_fsck_write_inode(trans, inode);
	if (ret)
		return ret;

	/*
	 * Fix up inodes in child snapshots: if they should also be reattached
	 * update the backpointer field, if they should not be we need to emit
	 * whiteouts for the dirent we just created.
	 */
	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
		snapshot_id_list whiteouts_done;
		struct btree_iter iter;
		struct bkey_s_c k;

		darray_init(&whiteouts_done);

		/*
		 * Walk backwards over versions of this inode in descendent
		 * snapshots (snapshot IDs of descendents are numerically
		 * smaller, hence starting at bi_snapshot - 1):
		 */
		for_each_btree_key_reverse_norestart(trans, iter,
				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
			if (k.k->p.offset != inode->bi_inum)
				break;

			/* Skip non-inodes, unrelated snapshots, and already-whited-out ones: */
			if (!bkey_is_inode(k.k) ||
			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
				continue;

			struct bch_inode_unpacked child_inode;
			ret = bch2_inode_unpack(k, &child_inode);
			if (ret)
				break;

			if (!inode_should_reattach(&child_inode)) {
				/*
				 * This child shouldn't see the dirent we just
				 * created - white it out in its snapshot:
				 */
				ret = maybe_delete_dirent(trans,
							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
							       dirent_snapshot),
							  k.k->p.snapshot);
				if (ret)
					break;

				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
				if (ret)
					break;
			} else {
				/* Point the child's backpointer at the new dirent: */
				iter.snapshot = k.k->p.snapshot;
				child_inode.bi_dir = inode->bi_dir;
				child_inode.bi_dir_offset = inode->bi_dir_offset;

				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
							     BTREE_UPDATE_internal_snapshot_node);
				if (ret)
					break;
			}
		}
		darray_exit(&whiteouts_done);
		bch2_trans_iter_exit(trans, &iter);
	}

	return ret;
}
502 
/*
 * Fetch the dirent at @pos; caller must exit @iter, and check the returned
 * key with bkey_err().
 */
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
						struct btree_iter *iter,
						struct bpos pos)
{
	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
}
509 
/*
 * Remove the dirent that @inode's backpointer fields (bi_dir/bi_dir_offset)
 * reference - but only if that dirent actually points back at this inode.
 */
static int remove_backpointer(struct btree_trans *trans,
			      struct bch_inode_unpacked *inode)
{
	/* No backpointer, nothing to remove: */
	if (!inode->bi_dir)
		return 0;

	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
				     SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
	int ret = bkey_err(d) ?:
		  dirent_points_to_inode(c, d, inode) ?:
		  bch2_fsck_remove_dirent(trans, d.k->p);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
526 
/*
 * Reattach a disconnected subvolume: remove any stale dirent backpointer from
 * its root inode, then reattach the root inode to lost+found.
 */
static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
	struct bch_fs *c = trans->c;

	struct bch_inode_unpacked inode;
	int ret = bch2_inode_find_by_inum_trans(trans,
				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
				&inode);
	if (ret)
		return ret;

	ret = remove_backpointer(trans, &inode);
	/* ENOENT here just means there was no dirent to remove; don't log it */
	if (!bch2_err_matches(ret, ENOENT))
		bch_err_msg(c, ret, "removing dirent");
	/*
	 * NOTE(review): an ENOENT from remove_backpointer() still aborts here
	 * without reattaching - confirm this is intended.
	 */
	if (ret)
		return ret;

	ret = reattach_inode(trans, &inode);
	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
	return ret;
}
548 
/*
 * Recreate a missing subvolume key for @subvolid with snapshot @snapshotid
 * and root inode @inum (creating a new root inode when @inum is 0), and fix
 * up the corresponding snapshot and snapshot_tree keys.
 */
static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
{
	struct bch_fs *c = trans->c;

	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
		return -BCH_ERR_fsck_repair_unimplemented;
	}

	/*
	 * If inum isn't set, that means we're being called from check_dirents,
	 * not check_inodes - the root of this subvolume doesn't exist or we
	 * would have found it there:
	 */
	if (!inum) {
		struct btree_iter inode_iter = {};
		struct bch_inode_unpacked new_inode;
		u64 cpu = raw_smp_processor_id();

		bch2_inode_init_early(c, &new_inode);
		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);

		new_inode.bi_subvol = subvolid;

		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
			  bch2_btree_iter_traverse(trans, &inode_iter) ?:
			  bch2_inode_write(trans, &inode_iter, &new_inode);
		bch2_trans_iter_exit(trans, &inode_iter);
		if (ret)
			return ret;

		inum = new_inode.bi_inum;
	}

	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);

	/* Insert the new subvolume key: */
	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
	int ret = PTR_ERR_OR_ZERO(new_subvol);
	if (ret)
		return ret;

	bkey_subvolume_init(&new_subvol->k_i);
	new_subvol->k.p.offset	= subvolid;
	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
	new_subvol->v.inode	= cpu_to_le64(inum);
	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
	if (ret)
		return ret;

	/* Point the snapshot back at the new subvolume: */
	struct btree_iter iter;
	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshots, POS(0, snapshotid),
			0, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
	if (ret)
		return ret;

	u32 snapshot_tree = le32_to_cpu(s->v.tree);

	s->v.subvol = cpu_to_le32(subvolid);
	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
	bch2_trans_iter_exit(trans, &iter);

	/* If the snapshot tree has no master subvol, use this one: */
	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
			0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(st);
	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
	if (ret)
		return ret;

	if (!st->v.master_subvol)
		st->v.master_subvol = cpu_to_le32(subvolid);

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}
627 
/*
 * Recreate a missing inode @inum:@snapshot, inferring its type from which
 * btree we found keys for it in: extents -> regular file (with i_size from
 * the last extent), dirents -> directory, xattrs -> regular file.
 */
static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 snapshot, u64 inum)
{
	struct bch_fs *c = trans->c;
	unsigned i_mode = S_IFREG;
	u64 i_size = 0;

	switch (btree) {
	case BTREE_ID_extents: {
		struct btree_iter iter = {};

		/* Find the last extent to recover the file size: */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
		struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
		bch2_trans_iter_exit(trans, &iter);
		int ret = bkey_err(k);
		if (ret)
			return ret;

		/* extent offsets are in 512-byte sectors; convert to bytes */
		i_size = k.k->p.offset << 9;
		break;
	}
	case BTREE_ID_dirents:
		i_mode = S_IFDIR;
		break;
	case BTREE_ID_xattrs:
		break;
	default:
		BUG();
	}

	struct bch_inode_unpacked new_inode;
	bch2_inode_init_early(c, &new_inode);
	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
	new_inode.bi_size = i_size;
	new_inode.bi_inum = inum;
	new_inode.bi_snapshot = snapshot;

	return __bch2_fsck_write_inode(trans, &new_inode);
}
666 
/*
 * Tracks the snapshot IDs of keys already seen at a given key position,
 * kept sorted (see snapshots_seen_add_inorder()); used to decide snapshot
 * visibility in key_visible_in_snapshot().
 */
struct snapshots_seen {
	struct bpos			pos;	/* position the id list applies to */
	snapshot_id_list		ids;	/* sorted snapshot IDs seen at @pos */
};
671 
/* Free the snapshot ID list: */
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
	darray_exit(&s->ids);
}
676 
/* Zero-initialize; pair with snapshots_seen_exit() to free the id list: */
static inline void snapshots_seen_init(struct snapshots_seen *s)
{
	memset(s, 0, sizeof(*s));
}
681 
/*
 * Insert @id into the sorted snapshot ID list, keeping it sorted and
 * duplicate-free.  Returns 0 on success (including when @id was already
 * present), or -ENOMEM-style error if the darray couldn't grow.
 */
static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
{
	u32 *i;
	/* Find the insertion point; bail out early on an exact match: */
	__darray_for_each(s->ids, i) {
		if (*i == id)
			return 0;
		if (*i > id)
			break;
	}

	int ret = darray_insert_item(&s->ids, i - s->ids.data, id);
	if (ret)
		bch_err(c, "error reallocating snapshots_seen table (size %zu)",
			s->ids.size);
	return ret;
}
698 
/*
 * Record that we've seen a key at @pos with pos.snapshot; resets the seen
 * list when we move to a new key position.
 */
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
				 enum btree_id btree_id, struct bpos pos)
{
	/* New position: the previously-seen snapshot IDs no longer apply */
	if (!bkey_eq(s->pos, pos))
		s->ids.nr = 0;
	s->pos = pos;

	return snapshot_list_add_nodup(c, &s->ids, pos.snapshot);
}
708 
709 /**
710  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
711  * and @ancestor hasn't been overwritten in @seen
712  *
713  * @c:		filesystem handle
714  * @seen:	list of snapshot ids already seen at current position
715  * @id:		descendent snapshot id
716  * @ancestor:	ancestor snapshot id
717  *
718  * Returns:	whether key in @ancestor snapshot is visible in @id snapshot
719  */
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
				    u32 id, u32 ancestor)
{
	ssize_t i;

	/* Descendent snapshot IDs are numerically smaller than ancestors': */
	EBUG_ON(id > ancestor);

	/* @ancestor should be the snapshot most recently added to @seen */
	EBUG_ON(ancestor != seen->pos.snapshot);
	EBUG_ON(ancestor != darray_last(seen->ids));

	if (id == ancestor)
		return true;

	if (!bch2_snapshot_is_ancestor(c, id, ancestor))
		return false;

	/*
	 * We know that @id is a descendant of @ancestor, we're checking if
	 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
	 * @ancestor and with @id as a descendent.
	 *
	 * But we already know that we're scanning IDs between @id and @ancestor
	 * numerically, since snapshot ID lists are kept sorted, so if we find
	 * an id that's an ancestor of @id we're done:
	 */

	/* Start at nr - 2: the last entry is @ancestor itself */
	for (i = seen->ids.nr - 2;
	     i >= 0 && seen->ids.data[i] >= id;
	     --i)
		if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
			return false;

	return true;
}
755 
756 /**
757  * ref_visible - given a key with snapshot id @src that points to a key with
758  * snapshot id @dst, test whether there is some snapshot in which @dst is
759  * visible.
760  *
761  * @c:		filesystem handle
762  * @s:		list of snapshot IDs already seen at @src
763  * @src:	snapshot ID of src key
764  * @dst:	snapshot ID of dst key
765  * Returns:	true if there is some snapshot in which @dst is visible
766  *
767  * Assumes we're visiting @src keys in natural key order
768  */
ref_visible(struct bch_fs * c,struct snapshots_seen * s,u32 src,u32 dst)769 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
770 			u32 src, u32 dst)
771 {
772 	return dst <= src
773 		? key_visible_in_snapshot(c, s, dst, src)
774 		: bch2_snapshot_is_ancestor(c, src, dst);
775 }
776 
/*
 * Like ref_visible(), but with seen-lists for both endpoints: normalize so
 * that src is the ancestor side, then do the visibility check.
 */
static int ref_visible2(struct bch_fs *c,
			u32 src, struct snapshots_seen *src_seen,
			u32 dst, struct snapshots_seen *dst_seen)
{
	if (dst <= src)
		return key_visible_in_snapshot(c, src_seen, dst, src);

	swap(dst, src);
	swap(dst_seen, src_seen);
	return key_visible_in_snapshot(c, src_seen, dst, src);
}
787 
788 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)				\
789 	for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&	\
790 	     (_i)->snapshot <= (_snapshot); _i++)					\
791 		if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
792 
/* One version of an inode, per snapshot it exists in: */
struct inode_walker_entry {
	struct bch_inode_unpacked inode;
	u32			snapshot;	/* snapshot this version lives in */
	u64			count;		/* per-pass accumulator — presumably sectors/subdir counts; see callers */
	u64			i_size;		/* per-pass accumulator; see callers */
};
799 
/*
 * Walks all snapshot versions of each inode as the check passes scan keys in
 * order; caches the versions of the inode at last_pos.inode.
 */
struct inode_walker {
	bool				first_this_inode;
	bool				have_inodes;	/* inodes[] is valid for last_pos.inode */
	bool				recalculate_sums;
	struct bpos			last_pos;	/* position of the last key walked */

	DARRAY(struct inode_walker_entry) inodes;
	snapshot_id_list		deletes;	/* snapshots where the inode was deleted */
};
809 
/* Free both darrays owned by the walker: */
static void inode_walker_exit(struct inode_walker *w)
{
	darray_exit(&w->inodes);
	darray_exit(&w->deletes);
}
815 
/* Return a zeroed walker; pair with inode_walker_exit(): */
static struct inode_walker inode_walker_init(void)
{
	return (struct inode_walker) { 0, };
}
820 
/*
 * Unpack @inode and append it to the walker's list of versions.
 * Returns an unpack error or darray allocation failure.
 */
static int add_inode(struct bch_fs *c, struct inode_walker *w,
		     struct bkey_s_c inode)
{
	struct bch_inode_unpacked u;

	return bch2_inode_unpack(inode, &u) ?:
		darray_push(&w->inodes, ((struct inode_walker_entry) {
		.inode		= u,
		.snapshot	= inode.k->p.snapshot,
	}));
}
832 
/*
 * Populate @w with every snapshot version of inode @inum.
 *
 * Fix: add_inode() can fail (unpack error, or ENOMEM growing the darray) and
 * its return value was previously ignored, silently leaving the walker with
 * missing versions; propagate the error instead.
 */
static int get_inodes_all_snapshots(struct btree_trans *trans,
				    struct inode_walker *w, u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/*
	 * We no longer have inodes for w->last_pos; clear this to avoid
	 * screwing up check_i_sectors/check_subdir_count if we take a
	 * transaction restart here:
	 */
	w->have_inodes = false;
	w->recalculate_sums = false;
	w->inodes.nr = 0;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (bkey_is_inode(k.k)) {
			ret = add_inode(c, w, k);
			if (ret)
				break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);

	if (ret)
		return ret;

	w->first_this_inode = true;
	w->have_inodes = true;
	return 0;
}
867 
868 static struct inode_walker_entry *
lookup_inode_for_snapshot(struct bch_fs * c,struct inode_walker * w,struct bkey_s_c k)869 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
870 {
871 	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
872 
873 	struct inode_walker_entry *i;
874 	__darray_for_each(w->inodes, i)
875 		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
876 			goto found;
877 
878 	return NULL;
879 found:
880 	BUG_ON(k.k->p.snapshot > i->snapshot);
881 
882 	if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
883 		struct inode_walker_entry new = *i;
884 
885 		new.snapshot	= k.k->p.snapshot;
886 		new.count	= 0;
887 		new.i_size	= 0;
888 
889 		struct printbuf buf = PRINTBUF;
890 		bch2_bkey_val_to_text(&buf, c, k);
891 
892 		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
893 			 "unexpected because we should always update the inode when we update a key in that inode\n"
894 			 "%s",
895 			 w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
896 		printbuf_exit(&buf);
897 
898 		while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
899 			--i;
900 
901 		size_t pos = i - w->inodes.data;
902 		int ret = darray_insert_item(&w->inodes, pos, new);
903 		if (ret)
904 			return ERR_PTR(ret);
905 
906 		i = w->inodes.data + pos;
907 	}
908 
909 	return i;
910 }
911 
/*
 * Advance the walker to key @k: refresh the cached inode versions when we've
 * moved to a new inode number, then return the version visible in k's
 * snapshot (NULL if none, ERR_PTR on error).
 */
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
					     struct inode_walker *w,
					     struct bkey_s_c k)
{
	if (w->last_pos.inode != k.k->p.inode) {
		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
		if (ret)
			return ERR_PTR(ret);
	}

	w->last_pos = k.k->p;

	return lookup_inode_for_snapshot(trans->c, w, k);
}
926 
/*
 * Collect into @w the versions of inode @inum that are visible from the
 * snapshot at s->pos; snapshots where the inode was deleted (non-inode keys)
 * are recorded in w->deletes instead.
 */
static int get_visible_inodes(struct btree_trans *trans,
			      struct inode_walker *w,
			      struct snapshots_seen *s,
			      u64 inum)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	w->inodes.nr = 0;
	w->deletes.nr = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot),
			   BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;

		if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot))
			continue;

		/* Already shadowed by a deletion in a descendent snapshot: */
		if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot))
			continue;

		ret = bkey_is_inode(k.k)
			? add_inode(c, w, k)
			: snapshot_list_add(c, &w->deletes, k.k->p.snapshot);
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
961 
962 /*
963  * Prefer to delete the first one, since that will be the one at the wrong
964  * offset:
965  * return value: 0 -> delete k1, 1 -> delete k2
966  */
bch2_fsck_update_backpointers(struct btree_trans * trans,struct snapshots_seen * s,const struct bch_hash_desc desc,struct bch_hash_info * hash_info,struct bkey_i * new)967 int bch2_fsck_update_backpointers(struct btree_trans *trans,
968 				  struct snapshots_seen *s,
969 				  const struct bch_hash_desc desc,
970 				  struct bch_hash_info *hash_info,
971 				  struct bkey_i *new)
972 {
973 	if (new->k.type != KEY_TYPE_dirent)
974 		return 0;
975 
976 	struct bkey_i_dirent *d = bkey_i_to_dirent(new);
977 	struct inode_walker target = inode_walker_init();
978 	int ret = 0;
979 
980 	if (d->v.d_type == DT_SUBVOL) {
981 		BUG();
982 	} else {
983 		ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
984 		if (ret)
985 			goto err;
986 
987 		darray_for_each(target.inodes, i) {
988 			i->inode.bi_dir_offset = d->k.p.offset;
989 			ret = __bch2_fsck_write_inode(trans, &i->inode);
990 			if (ret)
991 				goto err;
992 		}
993 	}
994 err:
995 	inode_walker_exit(&target);
996 	return ret;
997 }
998 
/*
 * Look up the dirent that an inode's backpointer (bi_dir, bi_dir_offset)
 * refers to.
 *
 * For a subvolume root the dirent lives in the parent subvolume, so
 * *snapshot is first overwritten with the parent subvolume's snapshot.
 * Errors come back encoded in the returned key (check with bkey_err()).
 */
static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
					       struct btree_iter *iter,
					       struct bch_inode_unpacked *inode,
					       u32 *snapshot)
{
	if (inode->bi_subvol) {
		u64 inum;
		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
		if (ret)
			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
	}

	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
1013 
check_inode_deleted_list(struct btree_trans * trans,struct bpos p)1014 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
1015 {
1016 	struct btree_iter iter;
1017 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
1018 	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
1019 	bch2_trans_iter_exit(trans, &iter);
1020 	return ret;
1021 }
1022 
/*
 * Check that an inode's dirent backpointer (bi_dir/bi_dir_offset) refers to
 * a dirent that points back at this inode.  On mismatch, the backpointer
 * fields are cleared and *write_inode is set so the caller rewrites the
 * inode; check_dirents()/check_path() later restore or reattach it.
 *
 * fsck_err-style errors are handled internally; returns 0 even when a
 * repair was flagged.
 */
static int check_inode_dirent_inode(struct btree_trans *trans,
				    struct bch_inode_unpacked *inode,
				    bool *write_inode)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	u32 inode_snapshot = inode->bi_snapshot;
	struct btree_iter dirent_iter = {};
	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
	int ret = bkey_err(d);
	/* ENOENT (missing dirent) is handled below as a fsck error */
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
	    inode->bi_subvol &&
	    (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
		/* Older version of a renamed subvolume root: we won't have a
		 * correct dirent for it. That's expected, see
		 * inode_should_reattach().
		 *
		 * We don't clear the backpointer field when doing the rename
		 * because there might be arbitrarily many versions in older
		 * snapshots.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
		goto out;
	}

	if (fsck_err_on(ret,
			trans, inode_points_to_missing_dirent,
			"inode points to missing dirent\n%s",
			(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
	    fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
			trans, inode_points_to_wrong_dirent,
			"%s",
			(printbuf_reset(&buf),
			 dirent_inode_mismatch_msg(&buf, c, d, inode),
			 buf.buf))) {
		/*
		 * We just clear the backpointer fields for now. If we find a
		 * dirent that points to this inode in check_dirents(), we'll
		 * update it then; then when we get to check_path() if the
		 * backpointer is still 0 we'll reattach it.
		 */
		inode->bi_dir = 0;
		inode->bi_dir_offset = 0;
		*write_inode = true;
	}
out:
	ret = 0;
fsck_err:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1082 
/*
 * Find the "snapshot root" version of an inode: scanning from snapshot ID
 * U32_MAX downward, the first live inode at this inode number.  Other
 * snapshot versions are checked against it (hash seed/type) in
 * check_inode().
 *
 * BUG()s if no version exists — the caller has already seen a live inode
 * key at this inode number.
 */
static int get_snapshot_root_inode(struct btree_trans *trans,
				   struct bch_inode_unpacked *root,
				   u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
					     SPOS(0, inum, U32_MAX),
					     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inum)
			break;
		if (bkey_is_inode(k.k))
			goto found_root;
	}
	if (ret)
		goto err;
	BUG();
found_root:
	ret = bch2_inode_unpack(k, root);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1108 
/*
 * Check one inode key: snapshot validity, hash-info consistency with the
 * snapshot root, dirent backpointer, unlinked/deleted-list state, subvolume
 * pointers and journal sequence number.  Repairs are accumulated in the
 * unpacked inode 'u' and written out once at the end if anything changed.
 *
 * @snapshot_root caches the snapshot-root version across calls (refreshed
 * when the inode number changes); @s tracks snapshots seen so far.
 */
static int check_inode(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_s_c k,
		       struct bch_inode_unpacked *snapshot_root,
		       struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct bch_inode_unpacked u;
	bool do_update = false;
	int ret;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret < 0)
		goto err;
	/* positive return: key was handled (e.g. deleted); nothing to check */
	if (ret)
		return 0;

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		goto err;

	/* Refresh the cached snapshot-root inode on a new inode number */
	if (snapshot_root->bi_inum != u.bi_inum) {
		ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
		if (ret)
			goto err;
	}

	/* Hash seed/type must agree across all snapshot versions */
	if (fsck_err_on(u.bi_hash_seed		!= snapshot_root->bi_hash_seed ||
			INODE_STR_HASH(&u)	!= INODE_STR_HASH(snapshot_root),
			trans, inode_snapshot_mismatch,
			"inode hash info in different snapshots don't match")) {
		u.bi_hash_seed = snapshot_root->bi_hash_seed;
		SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
		do_update = true;
	}

	if (u.bi_dir || u.bi_dir_offset) {
		ret = check_inode_dirent_inode(trans, &u, &do_update);
		if (ret)
			goto err;
	}

	/* A dirent backpointer contradicts the unlinked flag */
	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
			trans, inode_unlinked_but_has_dirent,
			"inode unlinked but has dirent\n%s",
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
	}

	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
		/* Check for this early so that check_unreachable_inode() will reattach it */

		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
			goto err;

		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
			    "dir unlinked but not empty\n%s",
			    (printbuf_reset(&buf),
			     bch2_inode_unpacked_to_text(&buf, &u),
			     buf.buf));
		/* Clear unlinked unconditionally so the dir can be reattached */
		u.bi_flags &= ~BCH_INODE_unlinked;
		do_update = true;
		ret = 0;
	}

	/* Recompute the has_child_snapshot flag and repair if stale */
	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
	if (ret < 0)
		goto err;

	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
			trans, inode_has_child_snapshots_wrong,
			"inode has_child_snapshots flag wrong (should be %u)\n%s",
			ret,
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			 buf.buf))) {
		if (ret)
			u.bi_flags |= BCH_INODE_has_child_snapshot;
		else
			u.bi_flags &= ~BCH_INODE_has_child_snapshot;
		do_update = true;
	}
	ret = 0;

	if ((u.bi_flags & BCH_INODE_unlinked) &&
	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
		if (!test_bit(BCH_FS_started, &c->flags)) {
			/*
			 * If we're not in online fsck, don't delete unlinked
			 * inodes, just make sure they're on the deleted list.
			 *
			 * They might be referred to by a logged operation -
			 * i.e. we might have crashed in the middle of a
			 * truncate on an unlinked but open file - so we want to
			 * let the delete_dead_inodes kill it after resuming
			 * logged ops.
			 */
			ret = check_inode_deleted_list(trans, k.k->p);
			if (ret < 0)
				goto err_noprint;

			fsck_err_on(!ret,
				    trans, unlinked_inode_not_on_deleted_list,
				    "inode %llu:%u unlinked, but not on deleted list",
				    u.bi_inum, k.k->p.snapshot);

			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
			if (ret)
				goto err;
		} else {
			/* Online fsck: safe to delete now if nothing has it open */
			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
			if (ret < 0)
				goto err;

			if (fsck_err_on(!ret,
					trans, inode_unlinked_and_not_open,
				      "inode %llu:%u unlinked and not open",
				      u.bi_inum, u.bi_snapshot)) {
				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
				bch_err_msg(c, ret, "in fsck deleting inode");
				goto err_noprint;
			}
			ret = 0;
		}
	}

	/* Only subvolume roots (other than the root subvol) have a parent subvol */
	if (fsck_err_on(u.bi_parent_subvol &&
			(u.bi_subvol == 0 ||
			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
			trans, inode_bi_parent_nonzero,
			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
		u.bi_parent_subvol = 0;
		do_update = true;
	}

	/* Validate bi_subvol, if set */
	if (u.bi_subvol) {
		struct bch_subvolume s;

		ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s);
		if (ret && !bch2_err_matches(ret, ENOENT))
			goto err;

		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
			/*
			 * NOTE(review): if do_update was already set, a
			 * reconstruct_subvol() error is overwritten by
			 * __bch2_fsck_write_inode()'s return at do_update:
			 * - confirm this is intended.
			 */
			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
			goto do_update;
		}

		if (fsck_err_on(ret,
				trans, inode_bi_subvol_missing,
				"inode %llu:%u bi_subvol points to missing subvolume %u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
							   k.k->p.snapshot),
				trans, inode_bi_subvol_wrong,
				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
				le64_to_cpu(s.inode),
				le32_to_cpu(s.snapshot))) {
			u.bi_subvol = 0;
			u.bi_parent_subvol = 0;
			do_update = true;
		}
	}

	if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal),
			trans, inode_journal_seq_in_future,
			"inode journal seq in future (currently at %llu)\n%s",
			journal_cur_seq(&c->journal),
			(printbuf_reset(&buf),
			 bch2_inode_unpacked_to_text(&buf, &u),
			buf.buf))) {
		u.bi_journal_seq = journal_cur_seq(&c->journal);
		do_update = true;
	}
do_update:
	if (do_update) {
		ret = __bch2_fsck_write_inode(trans, &u);
		bch_err_msg(c, ret, "in fsck updating inode");
		if (ret)
			goto err_noprint;
	}
err:
fsck_err:
	bch_err_fn(c, ret);
err_noprint:
	printbuf_exit(&buf);
	return ret;
}
1311 
/*
 * Pass over the entire inodes btree, all snapshots, running check_inode()
 * on each key and committing after each iteration.  The snapshot-root inode
 * is cached across iterations to avoid refetching it per version.
 */
int bch2_check_inodes(struct bch_fs *c)
{
	struct bch_inode_unpacked snapshot_root = {};
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS_MIN,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_inode(trans, &iter, k, &snapshot_root, &s)));

	snapshots_seen_exit(&s);
	bch_err_fn(c, ret);
	return ret;
}
1330 
/*
 * Walk versions of this inode at snapshot IDs above inode->bi_snapshot that
 * are in the same snapshot lineage, and move *inode up to the oldest
 * version that still needs reattaching, so the reattach happens once, at
 * the right version.
 *
 * *inode is updated in place; returns 0 or a btree iteration error.
 */
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
					    struct bch_inode_unpacked *inode)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	/*
	 * We look for inodes to reattach in natural key order, leaves first,
	 * but we should do the reattach at the oldest version that needs to be
	 * reattached:
	 */
	for_each_btree_key_norestart(trans, iter,
				     BTREE_ID_inodes,
				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
				     BTREE_ITER_all_snapshots, k, ret) {
		if (k.k->p.offset != inode->bi_inum)
			break;

		/* Unrelated snapshot: not a version of the same inode */
		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
			continue;

		if (!bkey_is_inode(k.k))
			break;

		struct bch_inode_unpacked parent_inode;
		ret = bch2_inode_unpack(k, &parent_inode);
		if (ret)
			break;

		if (!inode_should_reattach(&parent_inode))
			break;

		/* This older version also needs reattaching: prefer it */
		*inode = parent_inode;
	}
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}
1371 
/*
 * If this inode should be reattached (see inode_should_reattach()), find
 * the oldest snapshot version that needs it and reattach there (with the
 * user's confirmation via fsck_err).
 */
static int check_unreachable_inode(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!bkey_is_inode(k.k))
		return 0;

	struct bch_inode_unpacked inode;
	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		return ret;

	if (!inode_should_reattach(&inode))
		return 0;

	/* Reattach at the oldest version that needs it, not this one */
	ret = find_oldest_inode_needs_reattach(trans, &inode);
	if (ret)
		return ret;

	if (fsck_err(trans, inode_unreachable,
		     "unreachable inode:\n%s",
		     (bch2_inode_unpacked_to_text(&buf, &inode),
		      buf.buf)))
		ret = reattach_inode(trans, &inode);
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1403 
1404 /*
1405  * Reattach unreachable (but not unlinked) inodes
1406  *
1407  * Run after check_inodes() and check_dirents(), so we node that inode
1408  * backpointer fields point to valid dirents, and every inode that has a dirent
1409  * that points to it has its backpointer field set - so we're just looking for
1410  * non-unlinked inodes without backpointers:
1411  *
1412  * XXX: this is racy w.r.t. hardlink removal in online fsck
1413  */
bch2_check_unreachable_inodes(struct bch_fs * c)1414 int bch2_check_unreachable_inodes(struct bch_fs *c)
1415 {
1416 	int ret = bch2_trans_run(c,
1417 		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1418 				POS_MIN,
1419 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1420 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1421 			check_unreachable_inode(trans, &iter, k)));
1422 	bch_err_fn(c, ret);
1423 	return ret;
1424 }
1425 
btree_matches_i_mode(enum btree_id btree,unsigned mode)1426 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
1427 {
1428 	switch (btree) {
1429 	case BTREE_ID_extents:
1430 		return S_ISREG(mode) || S_ISLNK(mode);
1431 	case BTREE_ID_dirents:
1432 		return S_ISDIR(mode);
1433 	case BTREE_ID_xattrs:
1434 		return true;
1435 	default:
1436 		BUG();
1437 	}
1438 }
1439 
/*
 * Verify that a key in an inode's extents/dirents/xattrs btree has a
 * matching inode of the right type.
 *
 * @i is the inode-walker entry the key resolved to: ERR_PTR is propagated,
 * NULL means no inode was found.  Keys in a missing inode (or an inode of
 * the wrong mode) are deleted; if the inodes btree was lost, the inode is
 * reconstructed instead and a nested restart is returned.
 */
static int check_key_has_inode(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct inode_walker *inode,
			       struct inode_walker_entry *i,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = PTR_ERR_OR_ZERO(i);
	if (ret)
		return ret;

	/* Whiteouts don't need a live inode */
	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
		ret =   reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto err;

		/* Force the walker to re-read this inode number next time */
		inode->last_pos.inode--;
		ret = -BCH_ERR_transaction_restart_nested;
		goto err;
	}

	if (fsck_err_on(!i,
			trans, key_in_missing_inode,
			"key in missing inode:\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;

	if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
			trans, key_in_wrong_inode_type,
			"key for wrong inode mode %o:\n%s",
			i->inode.bi_mode,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		goto delete;
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
delete:
	ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
	goto out;
}
1490 
check_i_sectors_notnested(struct btree_trans * trans,struct inode_walker * w)1491 static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
1492 {
1493 	struct bch_fs *c = trans->c;
1494 	int ret = 0;
1495 	s64 count2;
1496 
1497 	darray_for_each(w->inodes, i) {
1498 		if (i->inode.bi_sectors == i->count)
1499 			continue;
1500 
1501 		count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1502 
1503 		if (w->recalculate_sums)
1504 			i->count = count2;
1505 
1506 		if (i->count != count2) {
1507 			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1508 					    w->last_pos.inode, i->snapshot, i->count, count2);
1509 			i->count = count2;
1510 		}
1511 
1512 		if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1513 				trans, inode_i_sectors_wrong,
1514 				"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1515 				w->last_pos.inode, i->snapshot,
1516 				i->inode.bi_sectors, i->count)) {
1517 			i->inode.bi_sectors = i->count;
1518 			ret = bch2_fsck_write_inode(trans, &i->inode);
1519 			if (ret)
1520 				break;
1521 		}
1522 	}
1523 fsck_err:
1524 	bch_err_fn(c, ret);
1525 	return ret;
1526 }
1527 
check_i_sectors(struct btree_trans * trans,struct inode_walker * w)1528 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1529 {
1530 	u32 restart_count = trans->restart_count;
1531 	return check_i_sectors_notnested(trans, w) ?:
1532 		trans_was_restarted(trans, restart_count);
1533 }
1534 
/* End position of the last extent seen in one snapshot, for overlap checks */
struct extent_end {
	u32			snapshot;
	/* End offset (k.k->p.offset) of the last extent in this snapshot */
	u64			offset;
	/* Private copy of the snapshots seen when that extent was processed */
	struct snapshots_seen	seen;
};

struct extent_ends {
	/* Last extent position processed; used to detect re-runs after restart */
	struct bpos			last_pos;
	/* One entry per snapshot, kept sorted by snapshot ID */
	DARRAY(struct extent_end)	e;
};
1545 
extent_ends_reset(struct extent_ends * extent_ends)1546 static void extent_ends_reset(struct extent_ends *extent_ends)
1547 {
1548 	darray_for_each(extent_ends->e, i)
1549 		snapshots_seen_exit(&i->seen);
1550 	extent_ends->e.nr = 0;
1551 }
1552 
/* Free per-entry snapshot lists first, then the array itself. */
static void extent_ends_exit(struct extent_ends *extent_ends)
{
	extent_ends_reset(extent_ends);
	darray_exit(&extent_ends->e);
}
1558 
extent_ends_init(struct extent_ends * extent_ends)1559 static void extent_ends_init(struct extent_ends *extent_ends)
1560 {
1561 	memset(extent_ends, 0, sizeof(*extent_ends));
1562 }
1563 
/*
 * Record the end position of an extent for overlap checking, keyed by
 * snapshot: replaces any existing entry for the same snapshot, otherwise
 * inserts keeping the array sorted by snapshot ID.
 *
 * Takes a private copy of the snapshots_seen ID list, since *seen is
 * mutated as the caller walks subsequent keys.
 */
static int extent_ends_at(struct bch_fs *c,
			  struct extent_ends *extent_ends,
			  struct snapshots_seen *seen,
			  struct bkey_s_c k)
{
	struct extent_end *i, n = (struct extent_end) {
		.offset		= k.k->p.offset,
		.snapshot	= k.k->p.snapshot,
		.seen		= *seen,
	};

	/*
	 * NOTE(review): this duplicates ids.size entries (full capacity),
	 * not just the ids in use — presumably so the copy keeps the same
	 * capacity field; confirm against the darray implementation.
	 */
	n.seen.ids.data = kmemdup(seen->ids.data,
			      sizeof(seen->ids.data[0]) * seen->ids.size,
			      GFP_KERNEL);
	if (!n.seen.ids.data)
		return -BCH_ERR_ENOMEM_fsck_extent_ends_at;

	__darray_for_each(extent_ends->e, i) {
		if (i->snapshot == k.k->p.snapshot) {
			/* Same snapshot: replace in place, freeing the old copy */
			snapshots_seen_exit(&i->seen);
			*i = n;
			return 0;
		}

		if (i->snapshot >= k.k->p.snapshot)
			break;
	}

	/* NOTE(review): on insert failure, n.seen.ids.data leaks — confirm */
	return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
}
1594 
/*
 * Repair a pair of overlapping extents (in snapshots visible to each
 * other).  pos1/pos1_seen identify the earlier extent (by end position),
 * pos2 the later one; both are looked up again with non-extent iterators so
 * we see the raw keys.  One of the two is overwritten (see the "overwriting
 * %s extent" message for which); *fixed is set on success and *extent_end
 * is updated to reflect the overwrite.
 *
 * Fix: the "error finding second overlapping extent" message previously
 * read "seconding".
 */
static int overlapping_extents_found(struct btree_trans *trans,
				     enum btree_id btree,
				     struct bpos pos1, struct snapshots_seen *pos1_seen,
				     struct bkey pos2,
				     bool *fixed,
				     struct extent_end *extent_end)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	struct btree_iter iter1, iter2 = {};
	struct bkey_s_c k1, k2;
	int ret;

	BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));

	bch2_trans_iter_init(trans, &iter1, btree, pos1,
			     BTREE_ITER_all_snapshots|
			     BTREE_ITER_not_extents);
	k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
	ret = bkey_err(k1);
	if (ret)
		goto err;

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k1);

	/* The extent we recorded earlier must still be there */
	if (!bpos_eq(pos1, k1.k->p)) {
		prt_str(&buf, "\nwanted\n  ");
		bch2_bpos_to_text(&buf, pos1);
		prt_str(&buf, "\n");
		bch2_bkey_to_text(&buf, &pos2);

		bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	bch2_trans_copy_iter(trans, &iter2, &iter1);

	/* Advance a copy of iter1 until we find the second extent: */
	while (1) {
		bch2_btree_iter_advance(trans, &iter2);

		k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
		ret = bkey_err(k2);
		if (ret)
			goto err;

		if (bpos_ge(k2.k->p, pos2.p))
			break;
	}

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k2);

	if (bpos_gt(k2.k->p, pos2.p) ||
	    pos2.size != k2.k->size) {
		bch_err(c, "%s: error finding second overlapping extent when repairing%s",
			__func__, buf.buf);
		ret = -BCH_ERR_internal_fsck_err;
		goto err;
	}

	prt_printf(&buf, "\noverwriting %s extent",
		   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");

	if (fsck_err(trans, extent_overlapping,
		     "overlapping extents%s", buf.buf)) {
		/* old_iter points at whichever extent will be overwritten */
		struct btree_iter *old_iter = &iter1;
		struct disk_reservation res = { 0 };

		if (pos1.snapshot < pos2.p.snapshot) {
			old_iter = &iter2;
			swap(k1, k2);
		}

		trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);

		ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
				BTREE_UPDATE_internal_snapshot_node,
				k1, k2) ?:
			bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
		bch2_disk_reservation_put(c, &res);

		bch_info(c, "repair ret %s", bch2_err_str(ret));

		if (ret)
			goto err;

		*fixed = true;

		if (pos1.snapshot == pos2.p.snapshot) {
			/*
			 * We overwrote the first extent, and did the overwrite
			 * in the same snapshot:
			 */
			extent_end->offset = bkey_start_offset(&pos2);
		} else if (pos1.snapshot > pos2.p.snapshot) {
			/*
			 * We overwrote the first extent in pos2's snapshot:
			 */
			ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
		} else {
			/*
			 * We overwrote the second extent - restart
			 * check_extent() from the top:
			 */
			ret = -BCH_ERR_transaction_restart_nested;
		}
	}
fsck_err:
err:
	bch2_trans_iter_exit(trans, &iter2);
	bch2_trans_iter_exit(trans, &iter1);
	printbuf_exit(&buf);
	return ret;
}
1712 
/*
 * Compare this extent against the recorded end positions of previously
 * seen extents in other snapshots, and repair (via
 * overlapping_extents_found()) each recorded extent that overlaps and is
 * mutually visible with this one.
 */
static int check_overlapping_extents(struct btree_trans *trans,
			      struct snapshots_seen *seen,
			      struct extent_ends *extent_ends,
			      struct bkey_s_c k,
			      struct btree_iter *iter,
			      bool *fixed)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	/* transaction restart, running again */
	if (bpos_eq(extent_ends->last_pos, k.k->p))
		return 0;

	/* New inode number: previously recorded ends can't overlap */
	if (extent_ends->last_pos.inode != k.k->p.inode)
		extent_ends_reset(extent_ends);

	darray_for_each(extent_ends->e, i) {
		/* Recorded extent ends at or before this one starts: no overlap */
		if (i->offset <= bkey_start_offset(k.k))
			continue;

		if (!ref_visible2(c,
				  k.k->p.snapshot, seen,
				  i->snapshot, &i->seen))
			continue;

		ret = overlapping_extents_found(trans, iter->btree_id,
						SPOS(iter->pos.inode,
						     i->offset,
						     i->snapshot),
						&i->seen,
						*k.k, fixed, i);
		if (ret)
			goto err;
	}

	extent_ends->last_pos = k.k->p;
err:
	return ret;
}
1753 
/*
 * Report (but don't repair) encoded extents whose uncompressed size exceeds
 * encoded_extent_max — such extents indicate a bug elsewhere, hence the
 * "please report this" message.  Always returns 0.
 */
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;
	/* opts value is in bytes; convert to 512-byte sectors */
	unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc_is_encoded(crc) &&
		    crc.uncompressed_size > encoded_extent_max_sectors) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
			printbuf_exit(&buf);
		}

	return 0;
}
1775 
/*
 * Check one extent key: snapshot validity, that an owning inode of the
 * right type exists, overlap with previously seen extents, extents past
 * i_size, and accumulate per-snapshot i_sectors counts in *inode.
 */
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct inode_walker *inode,
			struct snapshots_seen *s,
			struct extent_ends *extent_ends,
			struct disk_reservation *res)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* positive return: key was handled (e.g. deleted); not an error */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	/* New inode number: finish i_sectors checks for the previous inode */
	if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
		ret = check_i_sectors(trans, inode);
		if (ret)
			goto err;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	/* Inode version matching this extent's snapshot, if any (may be NULL) */
	struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
	ret = PTR_ERR_OR_ZERO(extent_i);
	if (ret)
		goto err;

	ret = check_key_has_inode(trans, iter, inode, extent_i, k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_whiteout) {
		ret = check_overlapping_extents(trans, s, extent_ends, k, iter,
						&inode->recalculate_sums);
		if (ret)
			goto err;

		/*
		 * Check inodes in reverse order, from oldest snapshots to
		 * newest, starting from the inode that matches this extent's
		 * snapshot. If we didn't have one, iterate over all inodes:
		 */
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			/* Reservations past i_size are legal (fallocate) */
			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
					!bkey_extent_is_reservation(k),
					trans, extent_past_end_of_inode,
					"extent type past end of inode %llu:%u, i_size %llu\n%s",
					i->inode.bi_inum, i->snapshot, i->inode.bi_size,
					(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
				struct btree_iter iter2;

				/* Delete the extent in this snapshot only */
				bch2_trans_copy_iter(trans, &iter2, iter);
				bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot);
				ret =   bch2_btree_iter_traverse(trans, &iter2) ?:
					bch2_btree_delete_at(trans, &iter2,
						BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &iter2);
				if (ret)
					goto err;

				/* Treat the key as deleted for the rest of this pass */
				iter->k.type = KEY_TYPE_whiteout;
				break;
			}
		}
	}

	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* Accumulate i_sectors for every inode version this extent is visible in */
	if (bkey_extent_is_allocation(k.k)) {
		for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
		     inode->inodes.data && i >= inode->inodes.data;
		     --i) {
			if (i->snapshot > k.k->p.snapshot ||
			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
				continue;

			i->count += k.k->size;
		}
	}

	if (k.k->type != KEY_TYPE_whiteout) {
		/* Record the extent's end position for future overlap checks */
		ret = extent_ends_at(c, extent_ends, s, k);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
1881 
1882 /*
1883  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1884  * that i_size an i_sectors are consistent
1885  */
bch2_check_extents(struct bch_fs * c)1886 int bch2_check_extents(struct bch_fs *c)
1887 {
1888 	struct inode_walker w = inode_walker_init();
1889 	struct snapshots_seen s;
1890 	struct extent_ends extent_ends;
1891 	struct disk_reservation res = { 0 };
1892 
1893 	snapshots_seen_init(&s);
1894 	extent_ends_init(&extent_ends);
1895 
1896 	int ret = bch2_trans_run(c,
1897 		for_each_btree_key(trans, iter, BTREE_ID_extents,
1898 				POS(BCACHEFS_ROOT_INO, 0),
1899 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
1900 			bch2_disk_reservation_put(c, &res);
1901 			check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
1902 			check_extent_overbig(trans, &iter, k);
1903 		})) ?:
1904 		check_i_sectors_notnested(trans, &w));
1905 
1906 	bch2_disk_reservation_put(c, &res);
1907 	extent_ends_exit(&extent_ends);
1908 	inode_walker_exit(&w);
1909 	snapshots_seen_exit(&s);
1910 
1911 	bch_err_fn(c, ret);
1912 	return ret;
1913 }
1914 
/* Walk the reflink btree, running check_extent_overbig() on every key. */
int bch2_check_indirect_extents(struct bch_fs *c)
{
	struct disk_reservation res = { 0 };

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
				POS_MIN,
				BTREE_ITER_prefetch, k,
				&res, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			/* drop the reservation left over from the previous key */
			bch2_disk_reservation_put(c, &res);
			check_extent_overbig(trans, &iter, k);
		})));

	bch2_disk_reservation_put(c, &res);
	bch_err_fn(c, ret);
	return ret;
}
1933 
/*
 * Compare each directory's bi_nlink against the subdirectory count we
 * accumulated while walking dirents; if they disagree, recount to double
 * check ourselves, then repair the inode if it's still wrong.
 */
static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
{
	struct bch_fs *c = trans->c;
	int ret = 0;
	s64 count2;

	darray_for_each(w->inodes, i) {
		if (i->inode.bi_nlink == i->count)
			continue;

		/* our walk disagrees with the inode - recount to be sure: */
		count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
		if (count2 < 0)
			return count2;

		if (i->count != count2) {
			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
					    w->last_pos.inode, i->snapshot, i->count, count2);
			i->count = count2;
			if (i->inode.bi_nlink == i->count)
				continue;
		}

		if (fsck_err_on(i->inode.bi_nlink != i->count,
				trans, inode_dir_wrong_nlink,
				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
			i->inode.bi_nlink = i->count;
			ret = bch2_fsck_write_inode(trans, &i->inode);
			if (ret)
				break;
		}
	}
fsck_err:
	bch_err_fn(c, ret);
	return ret;
}
1970 
check_subdir_dirents_count(struct btree_trans * trans,struct inode_walker * w)1971 static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
1972 {
1973 	u32 restart_count = trans->restart_count;
1974 	return check_subdir_count_notnested(trans, w) ?:
1975 		trans_was_restarted(trans, restart_count);
1976 }
1977 
1978 /* find a subvolume that's a descendent of @snapshot: */
/*
 * Find a subvolume whose snapshot is a descendent of @snapshot; on success
 * store its id in @subvolid and return 0, else -ENOENT (or a btree error).
 */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
			/*
			 * Read the key's position before tearing down the
			 * iterator: previously this exited the iterator first
			 * and then dereferenced @k, which points into the
			 * btree node the iterator had locked (and also exited
			 * the iterator a second time at found:).
			 */
			*subvolid = k.k->p.offset;
			goto found;
		}
	}
	if (!ret)
		ret = -ENOENT;
found:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2002 
2003 noinline_for_stack
check_dirent_to_subvol(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c_dirent d)2004 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
2005 				  struct bkey_s_c_dirent d)
2006 {
2007 	struct bch_fs *c = trans->c;
2008 	struct btree_iter subvol_iter = {};
2009 	struct bch_inode_unpacked subvol_root;
2010 	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
2011 	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
2012 	u32 parent_snapshot;
2013 	u32 new_parent_subvol = 0;
2014 	u64 parent_inum;
2015 	struct printbuf buf = PRINTBUF;
2016 	int ret = 0;
2017 
2018 	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
2019 	if (ret && !bch2_err_matches(ret, ENOENT))
2020 		return ret;
2021 
2022 	if (ret ||
2023 	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
2024 		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
2025 		if (ret2 && !bch2_err_matches(ret, ENOENT))
2026 			return ret2;
2027 	}
2028 
2029 	if (ret &&
2030 	    !new_parent_subvol &&
2031 	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
2032 		/*
2033 		 * Couldn't find a subvol for dirent's snapshot - but we lost
2034 		 * subvols, so we need to reconstruct:
2035 		 */
2036 		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
2037 		if (ret)
2038 			return ret;
2039 
2040 		parent_snapshot = d.k->p.snapshot;
2041 	}
2042 
2043 	if (fsck_err_on(ret,
2044 			trans, dirent_to_missing_parent_subvol,
2045 			"dirent parent_subvol points to missing subvolume\n%s",
2046 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
2047 	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
2048 			trans, dirent_not_visible_in_parent_subvol,
2049 			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
2050 			parent_snapshot,
2051 			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
2052 		if (!new_parent_subvol) {
2053 			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
2054 			return -BCH_ERR_fsck_repair_unimplemented;
2055 		}
2056 
2057 		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
2058 		ret = PTR_ERR_OR_ZERO(new_dirent);
2059 		if (ret)
2060 			goto err;
2061 
2062 		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
2063 	}
2064 
2065 	struct bkey_s_c_subvolume s =
2066 		bch2_bkey_get_iter_typed(trans, &subvol_iter,
2067 					 BTREE_ID_subvolumes, POS(0, target_subvol),
2068 					 0, subvolume);
2069 	ret = bkey_err(s.s_c);
2070 	if (ret && !bch2_err_matches(ret, ENOENT))
2071 		return ret;
2072 
2073 	if (ret) {
2074 		if (fsck_err(trans, dirent_to_missing_subvol,
2075 			     "dirent points to missing subvolume\n%s",
2076 			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
2077 			return bch2_fsck_remove_dirent(trans, d.k->p);
2078 		ret = 0;
2079 		goto out;
2080 	}
2081 
2082 	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
2083 			trans, subvol_fs_path_parent_wrong,
2084 			"subvol with wrong fs_path_parent, should be be %u\n%s",
2085 			parent_subvol,
2086 			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
2087 		struct bkey_i_subvolume *n =
2088 			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
2089 		ret = PTR_ERR_OR_ZERO(n);
2090 		if (ret)
2091 			goto err;
2092 
2093 		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
2094 	}
2095 
2096 	u64 target_inum = le64_to_cpu(s.v->inode);
2097 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
2098 
2099 	ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
2100 	if (ret && !bch2_err_matches(ret, ENOENT))
2101 		goto err;
2102 
2103 	if (ret) {
2104 		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
2105 		ret = -BCH_ERR_fsck_repair_unimplemented;
2106 		goto err;
2107 	}
2108 
2109 	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
2110 			trans, inode_bi_parent_wrong,
2111 			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
2112 			target_inum,
2113 			subvol_root.bi_parent_subvol, parent_subvol)) {
2114 		subvol_root.bi_parent_subvol = parent_subvol;
2115 		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
2116 		ret = __bch2_fsck_write_inode(trans, &subvol_root);
2117 		if (ret)
2118 			goto err;
2119 	}
2120 
2121 	ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
2122 	if (ret)
2123 		goto err;
2124 out:
2125 err:
2126 fsck_err:
2127 	bch2_trans_iter_exit(trans, &subvol_iter);
2128 	printbuf_exit(&buf);
2129 	return ret;
2130 }
2131 
/*
 * Check one key in the dirents btree: validate its snapshot, verify the
 * owning directory inode exists, check the hash and casefold flag, then
 * verify the dirent's target (inode or subvolume) and accumulate the
 * per-directory counts used by check_subdir_dirents_count().
 */
static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
			struct bkey_s_c k,
			struct bch_hash_info *hash_info,
			struct inode_walker *dir,
			struct inode_walker *target,
			struct snapshots_seen *s)
{
	struct bch_fs *c = trans->c;
	struct inode_walker_entry *i;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ret = bch2_check_key_has_snapshot(trans, iter, k);
	if (ret) {
		/* a positive return is not an error - squash to 0 */
		ret = ret < 0 ? ret : 0;
		goto out;
	}

	ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
	if (ret)
		goto err;

	if (k.k->type == KEY_TYPE_whiteout)
		goto out;

	/* moved on to a new directory: finish checks for the previous one */
	if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
		ret = check_subdir_dirents_count(trans, dir);
		if (ret)
			goto err;
	}

	i = walk_inode(trans, dir, k);
	ret = PTR_ERR_OR_ZERO(i);
	if (ret < 0)
		goto err;

	ret = check_key_has_inode(trans, iter, dir, i, k);
	if (ret)
		goto err;

	if (!i)
		goto out;

	/* (re)derive hash parameters when we move to a new directory */
	if (dir->first_this_inode)
		*hash_info = bch2_hash_info_init(c, &i->inode);
	dir->first_this_inode = false;

	ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
	if (ret < 0)
		goto err;
	if (ret) {
		/* dirent has been deleted */
		ret = 0;
		goto out;
	}

	if (k.k->type != KEY_TYPE_dirent)
		goto out;

	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* check casefold */
	if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
			trans, dirent_casefold_mismatch,
			"dirent casefold does not match dir casefold\n%s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		struct qstr name = bch2_dirent_get_name(d);
		u32 subvol = d.v->d_type == DT_SUBVOL
			? d.v->d_parent_subvol
			: 0;
		u64 target = d.v->d_type == DT_SUBVOL
			? d.v->d_child_subvol
			: d.v->d_inum;
		u64 dir_offset;

		/* delete and recreate, so the dirent matches the dir's casefold setting */
		ret =   bch2_hash_delete_at(trans,
					    bch2_dirent_hash_desc, hash_info, iter,
					    BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_dirent_create_snapshot(trans, subvol,
						    d.k->p.inode, d.k->p.snapshot,
						    hash_info,
						    d.v->d_type,
						    &name,
						    target,
						    &dir_offset,
						    BTREE_ITER_with_updates|
						    BTREE_UPDATE_internal_snapshot_node|
						    STR_HASH_must_create) ?:
			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);

		/* might need another check_dirents pass */
		goto out;
	}

	if (d.v->d_type == DT_SUBVOL) {
		ret = check_dirent_to_subvol(trans, iter, d);
		if (ret)
			goto err;
	} else {
		ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
		if (ret)
			goto err;

		if (fsck_err_on(!target->inodes.nr,
				trans, dirent_to_missing_inode,
				"dirent points to missing inode:\n%s",
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k),
				 buf.buf))) {
			ret = bch2_fsck_remove_dirent(trans, d.k->p);
			if (ret)
				goto err;
		}

		darray_for_each(target->inodes, i) {
			ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
			if (ret)
				goto err;
		}

		darray_for_each(target->deletes, i)
			if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i),
					trans, dirent_to_overwritten_inode,
					"dirent points to inode overwritten in snapshot %u:\n%s",
					*i,
					(printbuf_reset(&buf),
					 bch2_bkey_val_to_text(&buf, c, k),
					 buf.buf))) {
				struct btree_iter delete_iter;
				bch2_trans_iter_init(trans, &delete_iter,
						     BTREE_ID_dirents,
						     SPOS(k.k->p.inode, k.k->p.offset, *i),
						     BTREE_ITER_intent);
				ret =   bch2_btree_iter_traverse(trans, &delete_iter) ?:
					bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
							  hash_info,
							  &delete_iter,
							  BTREE_UPDATE_internal_snapshot_node);
				bch2_trans_iter_exit(trans, &delete_iter);
				if (ret)
					goto err;

			}
	}

	ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto err;

	/* account this dirent against every inode version that can see it */
	for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
		if (d.v->d_type == DT_DIR)
			i->count++;
		i->i_size += bkey_bytes(d.k);
	}
out:
err:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}
2295 
2296 /*
2297  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
2298  * validate d_type
2299  */
int bch2_check_dirents(struct bch_fs *c)
{
	struct inode_walker dir = inode_walker_init();
	struct inode_walker target = inode_walker_init();
	struct snapshots_seen s;
	struct bch_hash_info hash_info;	/* (re)initialized per directory inside check_dirent() */

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents,
				POS(BCACHEFS_ROOT_INO, 0),
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
		/* flush subdir count checks for the final directory we walked */
		check_subdir_count_notnested(trans, &dir));

	snapshots_seen_exit(&s);
	inode_walker_exit(&dir);
	inode_walker_exit(&target);
	bch_err_fn(c, ret);
	return ret;
}
2322 
check_xattr(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k,struct bch_hash_info * hash_info,struct inode_walker * inode)2323 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
2324 		       struct bkey_s_c k,
2325 		       struct bch_hash_info *hash_info,
2326 		       struct inode_walker *inode)
2327 {
2328 	struct bch_fs *c = trans->c;
2329 	struct inode_walker_entry *i;
2330 	int ret;
2331 
2332 	ret = bch2_check_key_has_snapshot(trans, iter, k);
2333 	if (ret < 0)
2334 		return ret;
2335 	if (ret)
2336 		return 0;
2337 
2338 	i = walk_inode(trans, inode, k);
2339 	ret = PTR_ERR_OR_ZERO(i);
2340 	if (ret)
2341 		return ret;
2342 
2343 	ret = check_key_has_inode(trans, iter, inode, i, k);
2344 	if (ret)
2345 		return ret;
2346 
2347 	if (!i)
2348 		return 0;
2349 
2350 	if (inode->first_this_inode)
2351 		*hash_info = bch2_hash_info_init(c, &i->inode);
2352 	inode->first_this_inode = false;
2353 
2354 	ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
2355 	bch_err_fn(c, ret);
2356 	return ret;
2357 }
2358 
2359 /*
2360  * Walk xattrs: verify that they all have a corresponding inode
2361  */
int bch2_check_xattrs(struct bch_fs *c)
{
	struct inode_walker inode = inode_walker_init();
	struct bch_hash_info hash_info;	/* (re)initialized per inode inside check_xattr() */
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
			POS(BCACHEFS_ROOT_INO, 0),
			BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
			k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc,
		check_xattr(trans, &iter, k, &hash_info, &inode)));

	inode_walker_exit(&inode);
	bch_err_fn(c, ret);
	return ret;
}
2381 
/*
 * Verify the root subvolume and the root directory inode exist and are sane,
 * creating them if missing.
 */
static int check_root_trans(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct bch_inode_unpacked root_inode;
	u32 snapshot;
	u64 inum;
	int ret;

	ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret, trans, root_subvol_missing,
				"root subvol missing")) {
		struct bkey_i_subvolume *root_subvol =
			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
		ret = PTR_ERR_OR_ZERO(root_subvol);
		if (ret)
			goto err;

		/* recreate with the well-known root snapshot/inum */
		snapshot	= U32_MAX;
		inum		= BCACHEFS_ROOT_INO;

		bkey_subvolume_init(&root_subvol->k_i);
		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
		root_subvol->v.flags	= 0;
		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
		root_subvol->v.inode	= cpu_to_le64(inum);
		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
		bch_err_msg(c, ret, "writing root subvol");
		if (ret)
			goto err;
	}

	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	if (mustfix_fsck_err_on(ret,
				trans, root_dir_missing,
				"root directory missing") ||
	    mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
				trans, root_inode_not_dir,
				"root inode not a directory")) {
		/* recreate the root directory inode from scratch */
		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
				0, NULL);
		root_inode.bi_inum = inum;
		root_inode.bi_snapshot = snapshot;

		ret = __bch2_fsck_write_inode(trans, &root_inode);
		bch_err_msg(c, ret, "writing root inode");
	}
err:
fsck_err:
	return ret;
}
2438 
2439 /* Get root directory, create if it doesn't exist: */
bch2_check_root(struct bch_fs * c)2440 int bch2_check_root(struct bch_fs *c)
2441 {
2442 	int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2443 		check_root_trans(trans));
2444 	bch_err_fn(c, ret);
2445 	return ret;
2446 }
2447 
/* resizable array of u32s; used below to record subvolume ids on a path */
typedef DARRAY(u32) darray_u32;
2449 
darray_u32_has(darray_u32 * d,u32 v)2450 static bool darray_u32_has(darray_u32 *d, u32 v)
2451 {
2452 	darray_for_each(*d, i)
2453 		if (*i == v)
2454 			return true;
2455 	return false;
2456 }
2457 
/*
 * Walk from subvolume @k up the fs_path_parent chain towards the root
 * subvolume, checking for loops and unreachable subvolumes; either problem
 * is repaired by reattaching the subvolume.
 */
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct btree_iter parent_iter = {};
	darray_u32 subvol_path = {};
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (k.k->type != KEY_TYPE_subvolume)
		return 0;

	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
		/* record every subvolume visited, for loop detection */
		ret = darray_push(&subvol_path, k.k->p.offset);
		if (ret)
			goto err;

		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);

		struct bch_inode_unpacked subvol_root;
		ret = bch2_inode_find_by_inum_trans(trans,
					(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
					&subvol_root);
		if (ret)
			break;

		u32 parent = le32_to_cpu(s.v->fs_path_parent);

		if (darray_u32_has(&subvol_path, parent)) {
			if (fsck_err(trans, subvol_loop, "subvolume loop"))
				ret = reattach_subvol(trans, s);
			break;
		}

		/* step up to the parent subvolume */
		bch2_trans_iter_exit(trans, &parent_iter);
		bch2_trans_iter_init(trans, &parent_iter,
				     BTREE_ID_subvolumes, POS(0, parent), 0);
		k = bch2_btree_iter_peek_slot(trans, &parent_iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
				trans, subvol_unreachable,
				"unreachable subvolume %s",
				(bch2_bkey_val_to_text(&buf, c, s.s_c),
				 buf.buf))) {
			ret = reattach_subvol(trans, s);
			break;
		}
	}
fsck_err:
err:
	printbuf_exit(&buf);
	darray_exit(&subvol_path);
	bch2_trans_iter_exit(trans, &parent_iter);
	return ret;
}
2515 
/* Run check_subvol_path() on every subvolume, committing per key. */
int bch2_check_subvolume_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_subvol_path(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
2526 
/* one step on an inode's path towards the root, for loop detection */
struct pathbuf_entry {
	u64	inum;
	u32	snapshot;
};

typedef DARRAY(struct pathbuf_entry) pathbuf;
2533 
/*
 * Set one inode's bi_depth to @new_depth, writing and committing only if it
 * actually changed.
 */
static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
				      u32 new_depth)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
					       SPOS(0, p->inum, p->snapshot), 0);
	struct bch_inode_unpacked inode;

	int ret = bkey_err(k);
	if (ret)
		goto out;

	if (!bkey_is_inode(k.k)) {
		ret = -BCH_ERR_ENOENT_inode;
		goto out;
	}

	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		goto out;

	if (inode.bi_depth != new_depth) {
		inode.bi_depth = new_depth;
		ret = __bch2_fsck_write_inode(trans, &inode) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
	}
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
2557 
/*
 * Rewrite bi_depth for every inode on @path: iterates from the last entry
 * pushed (the one nearest the root - see check_path_loop()), assigning
 * @new_bi_depth and incrementing by one per step back down.  Reports a
 * transaction restart if one happened along the way.
 */
static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	darray_for_each_reverse(*path, i) {
		ret = nested_lockrestart_do(trans,
				bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
		bch_err_fn(trans->c, ret);
		if (ret)
			break;

		new_bi_depth++;
	}

	return ret ?: trans_was_restarted(trans, restart_count);
}
2575 
path_is_dup(pathbuf * p,u64 inum,u32 snapshot)2576 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2577 {
2578 	darray_for_each(*p, i)
2579 		if (i->inum	== inum &&
2580 		    i->snapshot	== snapshot)
2581 			return true;
2582 	return false;
2583 }
2584 
check_path_loop(struct btree_trans * trans,struct bkey_s_c inode_k)2585 static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
2586 {
2587 	struct bch_fs *c = trans->c;
2588 	struct btree_iter inode_iter = {};
2589 	pathbuf path = {};
2590 	struct printbuf buf = PRINTBUF;
2591 	u32 snapshot = inode_k.k->p.snapshot;
2592 	bool redo_bi_depth = false;
2593 	u32 min_bi_depth = U32_MAX;
2594 	int ret = 0;
2595 
2596 	struct bch_inode_unpacked inode;
2597 	ret = bch2_inode_unpack(inode_k, &inode);
2598 	if (ret)
2599 		return ret;
2600 
2601 	while (!inode.bi_subvol) {
2602 		struct btree_iter dirent_iter;
2603 		struct bkey_s_c_dirent d;
2604 		u32 parent_snapshot = snapshot;
2605 
2606 		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2607 		ret = bkey_err(d.s_c);
2608 		if (ret && !bch2_err_matches(ret, ENOENT))
2609 			goto out;
2610 
2611 		if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
2612 			bch2_trans_iter_exit(trans, &dirent_iter);
2613 
2614 		if (bch2_err_matches(ret, ENOENT)) {
2615 			printbuf_reset(&buf);
2616 			bch2_bkey_val_to_text(&buf, c, inode_k);
2617 			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
2618 				bch2_err_str(ret), buf.buf);
2619 			goto out;
2620 		}
2621 
2622 		bch2_trans_iter_exit(trans, &dirent_iter);
2623 
2624 		ret = darray_push(&path, ((struct pathbuf_entry) {
2625 			.inum		= inode.bi_inum,
2626 			.snapshot	= snapshot,
2627 		}));
2628 		if (ret)
2629 			return ret;
2630 
2631 		snapshot = parent_snapshot;
2632 
2633 		bch2_trans_iter_exit(trans, &inode_iter);
2634 		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2635 					     SPOS(0, inode.bi_dir, snapshot), 0);
2636 
2637 		struct bch_inode_unpacked parent_inode;
2638 		ret = bkey_err(inode_k) ?:
2639 			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2640 			: bch2_inode_unpack(inode_k, &parent_inode);
2641 		if (ret) {
2642 			/* Should have been caught in dirents pass */
2643 			bch_err_msg(c, ret, "error looking up parent directory");
2644 			goto out;
2645 		}
2646 
2647 		min_bi_depth = parent_inode.bi_depth;
2648 
2649 		if (parent_inode.bi_depth < inode.bi_depth &&
2650 		    min_bi_depth < U16_MAX)
2651 			break;
2652 
2653 		inode = parent_inode;
2654 		snapshot = inode_k.k->p.snapshot;
2655 		redo_bi_depth = true;
2656 
2657 		if (path_is_dup(&path, inode.bi_inum, snapshot)) {
2658 			/* XXX print path */
2659 			bch_err(c, "directory structure loop");
2660 
2661 			darray_for_each(path, i)
2662 				pr_err("%llu:%u", i->inum, i->snapshot);
2663 			pr_err("%llu:%u", inode.bi_inum, snapshot);
2664 
2665 			if (fsck_err(trans, dir_loop, "directory structure loop")) {
2666 				ret = remove_backpointer(trans, &inode);
2667 				bch_err_msg(c, ret, "removing dirent");
2668 				if (ret)
2669 					break;
2670 
2671 				ret = reattach_inode(trans, &inode);
2672 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2673 			}
2674 
2675 			goto out;
2676 		}
2677 	}
2678 
2679 	if (inode.bi_subvol)
2680 		min_bi_depth = 0;
2681 
2682 	if (redo_bi_depth)
2683 		ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
2684 out:
2685 fsck_err:
2686 	bch2_trans_iter_exit(trans, &inode_iter);
2687 	darray_exit(&path);
2688 	printbuf_exit(&buf);
2689 	bch_err_fn(c, ret);
2690 	return ret;
2691 }
2692 
2693 /*
2694  * Check for loops in the directory structure: all other connectivity issues
2695  * have been fixed by prior passes
2696  */
int bch2_check_directory_structure(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
					  BTREE_ITER_intent|
					  BTREE_ITER_prefetch|
					  BTREE_ITER_all_snapshots, k,
					  NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
			/* only directories can participate in loops */
			if (!S_ISDIR(bkey_inode_mode(k)))
				continue;

			/* unlinked inodes aren't reachable via any path */
			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
				continue;

			check_path_loop(trans, k);
		})));

	bch_err_fn(c, ret);
	return ret;
}
2717 
/*
 * Table of link counts, one entry per (inum, snapshot), kept sorted by inum;
 * built and consumed one inode-number range at a time so memory use stays
 * bounded (see bch2_check_nlinks()).
 */
struct nlink_table {
	size_t		nr;	/* entries in use */
	size_t		size;	/* allocated capacity */

	struct nlink {
		u64	inum;
		u32	snapshot;
		u32	count;	/* links counted during the dirents walk */
	}		*d;
};
2728 
add_nlink(struct bch_fs * c,struct nlink_table * t,u64 inum,u32 snapshot)2729 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2730 		     u64 inum, u32 snapshot)
2731 {
2732 	if (t->nr == t->size) {
2733 		size_t new_size = max_t(size_t, 128UL, t->size * 2);
2734 		void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2735 
2736 		if (!d) {
2737 			bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2738 				new_size);
2739 			return -BCH_ERR_ENOMEM_fsck_add_nlink;
2740 		}
2741 
2742 		if (t->d)
2743 			memcpy(d, t->d, t->size * sizeof(t->d[0]));
2744 		kvfree(t->d);
2745 
2746 		t->d = d;
2747 		t->size = new_size;
2748 	}
2749 
2750 
2751 	t->d[t->nr++] = (struct nlink) {
2752 		.inum		= inum,
2753 		.snapshot	= snapshot,
2754 	};
2755 
2756 	return 0;
2757 }
2758 
nlink_cmp(const void * _l,const void * _r)2759 static int nlink_cmp(const void *_l, const void *_r)
2760 {
2761 	const struct nlink *l = _l;
2762 	const struct nlink *r = _r;
2763 
2764 	return cmp_int(l->inum, r->inum);
2765 }
2766 
/*
 * Count one dirent reference to @inum, seen in @snapshot, against the
 * matching entries in @links.  Inodes outside [range_start, range_end) belong
 * to a different pass and are skipped.
 */
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
		     struct nlink_table *links,
		     u64 range_start, u64 range_end, u64 inum, u32 snapshot)
{
	struct nlink *link, key = {
		.inum = inum, .snapshot = U32_MAX,
	};

	if (inum < range_start || inum >= range_end)
		return;

	link = __inline_bsearch(&key, links->d, links->nr,
				sizeof(links->d[0]), nlink_cmp);
	if (!link)
		return;

	/* nlink_cmp ignores snapshot, so rewind to the first entry for inum */
	while (link > links->d && link[0].inum == link[-1].inum)
		--link;

	/* count the link against each snapshot version that can see it */
	for (; link < links->d + links->nr && link->inum == inum; link++)
		if (ref_visible(c, s, snapshot, link->snapshot)) {
			link->count++;
			if (link->snapshot >= snapshot)
				break;
		}
}
2793 
/*
 * Pass 1 of nlink checking: collect every non-directory inode with nonzero
 * bi_nlink, starting from inum @start, into @t.  If the table fills up, *end
 * is set to where the next range should resume; otherwise it's left at
 * U64_MAX (set by the caller).
 */
noinline_for_stack
static int check_nlinks_find_hardlinks(struct bch_fs *c,
				       struct nlink_table *t,
				       u64 start, u64 *end)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_inodes,
				   POS(0, start),
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			if (!bkey_is_inode(k.k))
				continue;

			/* Should never fail, checked by bch2_inode_invalid: */
			struct bch_inode_unpacked u;
			_ret3 = bch2_inode_unpack(k, &u);
			if (_ret3)
				break;

			/*
			 * Backpointer and directory structure checks are sufficient for
			 * directories, since they can't have hardlinks:
			 */
			if (S_ISDIR(u.bi_mode))
				continue;

			/*
			 * Previous passes ensured that bi_nlink is nonzero if
			 * it had multiple hardlinks:
			 */
			if (!u.bi_nlink)
				continue;

			ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
			if (ret) {
				/* couldn't grow the table: remember where to resume */
				*end = k.k->p.offset;
				ret = 0;
				break;
			}
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}
2840 
/*
 * Pass 2 of nlink checking: walk all dirents, counting references to each
 * inode in [range_start, range_end) into @links.
 */
noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
				     u64 range_start, u64 range_end)
{
	struct snapshots_seen s;

	snapshots_seen_init(&s);

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
				   BTREE_ITER_intent|
				   BTREE_ITER_prefetch|
				   BTREE_ITER_all_snapshots, k, ({
			ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
			if (ret)
				break;

			if (k.k->type == KEY_TYPE_dirent) {
				struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

				/* only regular-file dirents contribute to nlink */
				if (d.v->d_type != DT_DIR &&
				    d.v->d_type != DT_SUBVOL)
					inc_link(c, &s, links, range_start, range_end,
						 le64_to_cpu(d.v->d_inum), d.k->p.snapshot);
			}
			0;
		})));

	snapshots_seen_exit(&s);

	bch_err_fn(c, ret);
	return ret;
}
2874 
/*
 * Pass 3 of nlink checking: compare one inode's on-disk link count against
 * the count collected in @links and repair on mismatch.  Returns 1 once the
 * walk passes @range_end, signalling the caller to stop; @idx advances
 * monotonically through the sorted table.
 */
static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
				     struct bkey_s_c k,
				     struct nlink_table *links,
				     size_t *idx, u64 range_end)
{
	struct bch_inode_unpacked u;
	struct nlink *link = &links->d[*idx];
	int ret = 0;

	if (k.k->p.offset >= range_end)
		return 1;

	if (!bkey_is_inode(k.k))
		return 0;

	ret = bch2_inode_unpack(k, &u);
	if (ret)
		return ret;

	/* directories can't have hardlinks; checked by other passes */
	if (S_ISDIR(u.bi_mode))
		return 0;

	if (!u.bi_nlink)
		return 0;

	/* advance to this inode's entry in the sorted table */
	while ((cmp_int(link->inum, k.k->p.offset) ?:
		cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
		BUG_ON(*idx == links->nr);
		link = &links->d[++*idx];
	}

	if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
			trans, inode_wrong_nlink,
			"inode %llu type %s has wrong i_nlink (%u, should be %u)",
			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
			bch2_inode_nlink_get(&u), link->count)) {
		bch2_inode_nlink_set(&u, link->count);
		ret = __bch2_fsck_write_inode(trans, &u);
	}
fsck_err:
	return ret;
}
2917 
/*
 * Final pass of the nlink check: walk inodes starting at @range_start and fix
 * any whose i_nlink disagrees with the counts collected in @links, committing
 * each fix (BCH_TRANS_COMMIT_no_enospc).
 */
noinline_for_stack
static int check_nlinks_update_hardlinks(struct bch_fs *c,
			       struct nlink_table *links,
			       u64 range_start, u64 range_end)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
				POS(0, range_start),
				BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
	if (ret < 0) {
		bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
		return ret;
	}

	/* ret == 1 just means iteration stopped at range_end - not an error */
	return 0;
}
2938 
bch2_check_nlinks(struct bch_fs * c)2939 int bch2_check_nlinks(struct bch_fs *c)
2940 {
2941 	struct nlink_table links = { 0 };
2942 	u64 this_iter_range_start, next_iter_range_start = 0;
2943 	int ret = 0;
2944 
2945 	do {
2946 		this_iter_range_start = next_iter_range_start;
2947 		next_iter_range_start = U64_MAX;
2948 
2949 		ret = check_nlinks_find_hardlinks(c, &links,
2950 						  this_iter_range_start,
2951 						  &next_iter_range_start);
2952 
2953 		ret = check_nlinks_walk_dirents(c, &links,
2954 					  this_iter_range_start,
2955 					  next_iter_range_start);
2956 		if (ret)
2957 			break;
2958 
2959 		ret = check_nlinks_update_hardlinks(c, &links,
2960 					 this_iter_range_start,
2961 					 next_iter_range_start);
2962 		if (ret)
2963 			break;
2964 
2965 		links.nr = 0;
2966 	} while (next_iter_range_start != U64_MAX);
2967 
2968 	kvfree(links.d);
2969 	bch_err_fn(c, ret);
2970 	return ret;
2971 }
2972 
fix_reflink_p_key(struct btree_trans * trans,struct btree_iter * iter,struct bkey_s_c k)2973 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2974 			     struct bkey_s_c k)
2975 {
2976 	struct bkey_s_c_reflink_p p;
2977 	struct bkey_i_reflink_p *u;
2978 
2979 	if (k.k->type != KEY_TYPE_reflink_p)
2980 		return 0;
2981 
2982 	p = bkey_s_c_to_reflink_p(k);
2983 
2984 	if (!p.v->front_pad && !p.v->back_pad)
2985 		return 0;
2986 
2987 	u = bch2_trans_kmalloc(trans, sizeof(*u));
2988 	int ret = PTR_ERR_OR_ZERO(u);
2989 	if (ret)
2990 		return ret;
2991 
2992 	bkey_reassemble(&u->k_i, k);
2993 	u->v.front_pad	= 0;
2994 	u->v.back_pad	= 0;
2995 
2996 	return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_norun);
2997 }
2998 
/*
 * One-time repair pass clearing reflink_p padding across all extents.
 * Skipped entirely once the superblock version is at or past
 * reflink_p_fix, i.e. when the fix is known to have been applied.
 */
int bch2_fix_reflink_p(struct bch_fs *c)
{
	if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
		return 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_extents, POS_MIN,
				BTREE_ITER_intent|BTREE_ITER_prefetch|
				BTREE_ITER_all_snapshots, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			fix_reflink_p_key(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
3014 
3015 #ifndef NO_BCACHEFS_CHARDEV
3016 
/* State for a fsck worker kthread whose output is piped to userspace */
struct fsck_thread {
	struct thread_with_stdio thr;	/* must be first? no - accessed via container_of() */
	struct bch_fs		*c;	/* filesystem under fsck (may be an ERR_PTR for offline) */
	struct bch_opts		opts;	/* options parsed from the ioctl's opts string */
};
3022 
bch2_fsck_thread_exit(struct thread_with_stdio * _thr)3023 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
3024 {
3025 	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
3026 	kfree(thr);
3027 }
3028 
/*
 * Offline fsck worker: start the filesystem opened by
 * bch2_ioctl_fsck_offline() (recovery/fsck happens during startup per the
 * parsed options), report results over the stdio pipe, then stop it.
 *
 * On success the return value mirrors fsck(8)-style exit bits:
 * 1 = errors were fixed, 4 = errors remain; otherwise a negative errno
 * from open/start.
 */
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* bch2_fs_open() may have stored an error pointer in thr->c: */
	int ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		return ret;

	ret = bch2_fs_start(thr->c);
	if (ret)
		goto err;

	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
		ret |= 1;
	}
	if (test_bit(BCH_FS_error, &c->flags)) {
		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
		ret |= 4;
	}
err:
	bch2_fs_stop(c);
	return ret;
}
3054 
/* Dispatch table for the offline fsck thread */
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_offline_thread_fn,
};
3059 
/*
 * BCH_IOCTL_FSCK_OFFLINE: open the given (unmounted) devices and run fsck in
 * a kernel thread, with its output redirected to a stdio pipe that userspace
 * reads.
 *
 * @user_arg: userspace pointer to the ioctl args; devs[] entries are
 * themselves userspace string pointers encoded as u64.
 */
long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
	struct bch_ioctl_fsck_offline arg;
	struct fsck_thread *thr = NULL;
	darray_str(devs) = {};
	long ret = 0;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * Copy in the device path strings.
	 * NOTE(review): arg.nr_devs has no explicit upper bound here beyond
	 * allocation failure - confirm a sane limit is enforced elsewhere.
	 */
	for (size_t i = 0; i < arg.nr_devs; i++) {
		u64 dev_u64;
		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
		if (ret)
			goto err;

		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
		ret = PTR_ERR_OR_ZERO(dev_str);
		if (ret)
			goto err;

		ret = darray_push(&devs, dev_str);
		if (ret) {
			/* darray didn't take ownership - free it ourselves */
			kfree(dev_str);
			goto err;
		}
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		/* Mount-style option string, also a userspace pointer: */
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	/* Route all messages from this fs instance to the ioctl's stdio pipe: */
	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
	opt_set(thr->opts, read_only, 1);
	opt_set(thr->opts, ratelimit_errors, 0);

	/* We need request_key() to be called before we punt to kthread: */
	opt_set(thr->opts, nostart, true);

	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);

	/* May return an ERR_PTR; the thread fn checks for that case itself */
	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);

	/* Don't let a fsck invoked via ioctl panic the machine: */
	if (!IS_ERR(thr->c) &&
	    thr->c->opts.errors == BCH_ON_ERROR_panic)
		thr->c->opts.errors = BCH_ON_ERROR_ro;

	ret = __bch2_run_thread_with_stdio(&thr->thr);
out:
	darray_for_each(devs, i)
		kfree(*i);
	darray_exit(&devs);
	return ret;
err:
	/* thr (if allocated) is freed via its exit hook */
	if (thr)
		bch2_fsck_thread_exit(&thr->thr);
	pr_err("ret %s", bch2_err_str(ret));
	goto out;
}
3140 
/*
 * Online fsck worker: temporarily override c->opts (fix_errors, fsck) and
 * rerun the recovery passes starting from check_alloc_info on the live
 * filesystem, then restore state and release the refs taken by
 * bch2_ioctl_fsck_online().
 */
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
	struct bch_fs *c = thr->c;

	/* Redirect fsck output from this thread to the userspace pipe: */
	c->stdio_filter = current;
	c->stdio = &thr->thr.stdio;

	/*
	 * XXX: can we figure out a way to do this without mucking with c->opts?
	 */
	unsigned old_fix_errors = c->opts.fix_errors;
	if (opt_defined(thr->opts, fix_errors))
		c->opts.fix_errors = thr->opts.fix_errors;
	else
		c->opts.fix_errors = FSCK_FIX_ask;	/* presumably prompts userspace per error - confirm */

	c->opts.fsck = true;
	set_bit(BCH_FS_fsck_running, &c->flags);

	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
	int ret = bch2_run_online_recovery_passes(c);

	clear_bit(BCH_FS_fsck_running, &c->flags);
	bch_err_fn(c, ret);

	c->stdio = NULL;
	c->stdio_filter = NULL;
	c->opts.fix_errors = old_fix_errors;

	/* Release what bch2_ioctl_fsck_online() acquired before launching us: */
	up(&c->online_fsck_mutex);
	bch2_ro_ref_put(c);
	return ret;
}
3175 
/* Dispatch table for the online fsck thread */
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
	.exit		= bch2_fsck_thread_exit,
	.fn		= bch2_fsck_online_thread_fn,
};
3180 
/*
 * BCH_IOCTL_FSCK_ONLINE: run fsck on a mounted filesystem in a kernel
 * thread. Takes a ro ref on @c and the online_fsck_mutex (only one online
 * fsck at a time); on successful launch the thread fn releases both,
 * otherwise they're released on the error path here.
 */
long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
{
	struct fsck_thread *thr = NULL;
	long ret = 0;

	if (arg.flags)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!bch2_ro_ref_tryget(c))
		return -EROFS;

	/* down_trylock() returns nonzero on contention: another fsck is running */
	if (down_trylock(&c->online_fsck_mutex)) {
		bch2_ro_ref_put(c);
		return -EAGAIN;
	}

	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
	if (!thr) {
		ret = -ENOMEM;
		goto err;
	}

	thr->c = c;
	thr->opts = bch2_opts_empty();

	if (arg.opts) {
		/* Mount-style option string copied from userspace: */
		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);

		ret =   PTR_ERR_OR_ZERO(optstr) ?:
			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
		if (!IS_ERR(optstr))
			kfree(optstr);

		if (ret)
			goto err;
	}

	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
	if (ret < 0) {
		/* Launch failed: undo the ref + mutex the thread would have released */
		bch_err_fn(c, ret);
		if (thr)
			bch2_fsck_thread_exit(&thr->thr);
		up(&c->online_fsck_mutex);
		bch2_ro_ref_put(c);
	}
	return ret;
}
3232 
3233 #endif /* NO_BCACHEFS_CHARDEV */
3234