xref: /src/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c (revision 80aae8a3f8aa70712930664572be9e6885dc0be7)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
24  * Copyright (c) 2023, Datto Inc. All rights reserved.
25  * Copyright (c) 2025, Klara, Inc.
26  * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
27  * Copyright (c) 2026, TrueNAS.
28  */
29 
30 
31 #include <sys/zfs_znode.h>
32 #include <sys/zfs_vfsops.h>
33 #include <sys/zfs_vnops.h>
34 #include <sys/zfs_ctldir.h>
35 #include <sys/zpl.h>
36 #include <linux/iversion.h>
37 #include <linux/version.h>
38 #include <linux/vfs_compat.h>
39 #include <linux/fs_context.h>
40 
/*
 * Tunable: policy applied when the last reference to an inode is released.
 *
 *   0 - the kernel keeps the inode cached on the superblock.
 *   1 - the inode is freed immediately.
 *
 * Consumed by zpl_drop_inode().
 */
int zfs_delete_inode = 0;
47 
/*
 * Tunable: policy applied when the last reference to a dentry is released.
 *
 *   0 - the kernel keeps the dentry cached until the entry (file) is
 *       destroyed.
 *   1 - the dentry is marked for cleanup, at which time its inode reference
 *       is released.
 *
 * Consumed by zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;
55 
56 static struct inode *
zpl_inode_alloc(struct super_block * sb)57 zpl_inode_alloc(struct super_block *sb)
58 {
59 	struct inode *ip;
60 
61 	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
62 	inode_set_iversion(ip, 1);
63 
64 	return (ip);
65 }
66 
#ifdef HAVE_SOPS_FREE_INODE
/*
 * ->free_inode() callback, only built when the kernel provides that hook.
 * The inode must have no remaining references (i_count == 0) when it is
 * passed to zfs_inode_free().
 */
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));

	zfs_inode_free(ip);
}
#endif /* HAVE_SOPS_FREE_INODE */
75 
76 static void
zpl_inode_destroy(struct inode * ip)77 zpl_inode_destroy(struct inode *ip)
78 {
79 	ASSERT0(atomic_read(&ip->i_count));
80 	zfs_inode_destroy(ip);
81 }
82 
83 /*
84  * Called from __mark_inode_dirty() to reflect that something in the
85  * inode has changed.  We use it to ensure the znode system attributes
86  * are always strictly update to date with respect to the inode.
87  */
88 static void
zpl_dirty_inode(struct inode * ip,int flags)89 zpl_dirty_inode(struct inode *ip, int flags)
90 {
91 	fstrans_cookie_t cookie;
92 
93 	cookie = spl_fstrans_mark();
94 	zfs_dirty_inode(ip, flags);
95 	spl_fstrans_unmark(cookie);
96 }
97 
98 /*
99  * ->drop_inode() is called when the last reference to an inode is released.
100  * Its return value indicates if the inode should be destroyed immediately, or
101  * cached on the superblock structure.
102  *
103  * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
104  * "destroy immediately" if the inode is unhashed and has no links (roughly: no
105  * longer exists on disk). On datasets with millions of rarely-accessed files,
106  * this can cause a large amount of memory to be "pinned" by cached inodes,
107  * which in turn pin their associated dnodes and dbufs, until the kernel starts
108  * reporting memory pressure and requests OpenZFS release some memory (see
109  * zfs_prune()).
110  *
111  * When set to 1, we call generic_delete_inode(), which always returns "destroy
112  * immediately", resulting in inodes being destroyed immediately, releasing
113  * their associated dnodes and dbufs to the dbuf cached and the ARC to be
114  * evicted as normal.
115  *
116  * Note that the "last reference" doesn't always mean the last _userspace_
117  * reference; the dentry cache also holds a reference, so "busy" inodes will
118  * still be kept alive that way (subject to dcache tuning).
119  */
120 static int
zpl_drop_inode(struct inode * ip)121 zpl_drop_inode(struct inode *ip)
122 {
123 	if (zfs_delete_inode)
124 		return (generic_delete_inode(ip));
125 	return (generic_drop_inode(ip));
126 }
127 
128 /*
129  * The ->evict_inode() callback must minimally truncate the inode pages,
130  * and call clear_inode().  For 2.6.35 and later kernels this will
131  * simply update the inode state, with the sync occurring before the
132  * truncate in evict().  For earlier kernels clear_inode() maps to
133  * end_writeback() which is responsible for completing all outstanding
134  * write back.  In either case, once this is done it is safe to cleanup
135  * any remaining inode specific data via zfs_inactive().
136  * remaining filesystem specific data.
137  */
138 static void
zpl_evict_inode(struct inode * ip)139 zpl_evict_inode(struct inode *ip)
140 {
141 	fstrans_cookie_t cookie;
142 
143 	cookie = spl_fstrans_mark();
144 	truncate_setsize(ip, 0);
145 	clear_inode(ip);
146 	zfs_inactive(ip);
147 	spl_fstrans_unmark(cookie);
148 }
149 
150 static void
zpl_put_super(struct super_block * sb)151 zpl_put_super(struct super_block *sb)
152 {
153 	fstrans_cookie_t cookie;
154 	int error;
155 
156 	cookie = spl_fstrans_mark();
157 	error = -zfs_umount(sb);
158 	spl_fstrans_unmark(cookie);
159 	ASSERT3S(error, <=, 0);
160 }
161 
162 /*
163  * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
164  * syscalls, via sb->s_op->sync_fs().
165  *
166  * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
167  * sync_filesystem() would ignore the return from sync_fs(), instead only
168  * considing the error from syncing the underlying block device (sb->s_dev).
169  * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
170  * us to report a sync directly.
171  *
172  * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
173  * error store `s_wb_err`, to carry errors seen on page writeback since the
174  * last call to syncfs(). If sync_filesystem() does not return an error, any
175  * existing writeback error on the superblock will be used instead (and cleared
176  * either way). We don't use this (page writeback is a different thing for us),
177  * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
178  *
179  * Before 5.8, we have no other good options - no matter what happens, the
180  * userspace program will be told the call has succeeded, and so we must make
181  * it so, Therefore, when we are asked to wait for sync to complete (wait ==
182  * 1), if zfs_sync() has returned an error we have no choice but to block,
183  * regardless of the reason.
184  *
185  * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
186  * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
187  * mainline Linux series at time of writing), and has likely been backported to
188  * vendor kernels before 5.8. We don't really want to use a workaround when we
189  * don't have to, but we can't really detect whether or not sync_filesystem()
190  * will return our errors (without a difficult runtime test anyway). So, we use
191  * a static version check: any kernel reporting its version as 5.17+ will use a
192  * direct error return, otherwise, we'll either use s_wb_err if it was detected
193  * at configure (5.8-5.16 + vendor backports). If it's unavailable, we will
194  * block to ensure the correct semantics.
195  *
196  * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
197  */
198 static int
zpl_sync_fs(struct super_block * sb,int wait)199 zpl_sync_fs(struct super_block *sb, int wait)
200 {
201 	fstrans_cookie_t cookie;
202 	cred_t *cr = CRED();
203 	int error;
204 
205 	crhold(cr);
206 	cookie = spl_fstrans_mark();
207 	error = -zfs_sync(sb, wait, cr);
208 
209 #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
210 #ifdef HAVE_SUPER_BLOCK_S_WB_ERR
211 	if (error && wait)
212 		errseq_set(&sb->s_wb_err, error);
213 #else
214 	if (error && wait) {
215 		zfsvfs_t *zfsvfs = sb->s_fs_info;
216 		ASSERT3P(zfsvfs, !=, NULL);
217 		if (zfs_enter(zfsvfs, FTAG) == 0) {
218 			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
219 			zfs_exit(zfsvfs, FTAG);
220 			error = 0;
221 		}
222 	}
223 #endif
224 #endif /* < 5.17.0 */
225 
226 	spl_fstrans_unmark(cookie);
227 	crfree(cr);
228 
229 	ASSERT3S(error, <=, 0);
230 	return (error);
231 }
232 
233 static int
zpl_statfs(struct dentry * dentry,struct kstatfs * statp)234 zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
235 {
236 	fstrans_cookie_t cookie;
237 	int error;
238 
239 	cookie = spl_fstrans_mark();
240 	error = -zfs_statvfs(dentry->d_inode, statp);
241 	spl_fstrans_unmark(cookie);
242 	ASSERT3S(error, <=, 0);
243 
244 	/*
245 	 * If required by a 32-bit system call, dynamically scale the
246 	 * block size up to 16MiB and decrease the block counts.  This
247 	 * allows for a maximum size of 64EiB to be reported.  The file
248 	 * counts must be artificially capped at 2^32-1.
249 	 */
250 	if (unlikely(zpl_is_32bit_api())) {
251 		while (statp->f_blocks > UINT32_MAX &&
252 		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
253 			statp->f_frsize <<= 1;
254 			statp->f_bsize <<= 1;
255 
256 			statp->f_blocks >>= 1;
257 			statp->f_bfree >>= 1;
258 			statp->f_bavail >>= 1;
259 		}
260 
261 		uint64_t usedobjs = statp->f_files - statp->f_ffree;
262 		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
263 		statp->f_files = statp->f_ffree + usedobjs;
264 	}
265 
266 	return (error);
267 }
268 
269 static int
__zpl_show_devname(struct seq_file * seq,zfsvfs_t * zfsvfs)270 __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
271 {
272 	int error;
273 	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
274 		return (error);
275 
276 	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
277 	dmu_objset_name(zfsvfs->z_os, fsname);
278 
279 	for (int i = 0; fsname[i] != 0; i++) {
280 		/*
281 		 * Spaces in the dataset name must be converted to their
282 		 * octal escape sequence for getmntent(3) to correctly
283 		 * parse then fsname portion of /proc/self/mounts.
284 		 */
285 		if (fsname[i] == ' ') {
286 			seq_puts(seq, "\\040");
287 		} else {
288 			seq_putc(seq, fsname[i]);
289 		}
290 	}
291 
292 	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
293 
294 	zpl_exit(zfsvfs, FTAG);
295 
296 	return (0);
297 }
298 
299 static int
zpl_show_devname(struct seq_file * seq,struct dentry * root)300 zpl_show_devname(struct seq_file *seq, struct dentry *root)
301 {
302 	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
303 }
304 
305 static int
__zpl_show_options(struct seq_file * seq,zfsvfs_t * zfsvfs)306 __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
307 {
308 	seq_printf(seq, ",%s",
309 	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
310 
311 #ifdef CONFIG_FS_POSIX_ACL
312 	switch (zfsvfs->z_acl_type) {
313 	case ZFS_ACLTYPE_POSIX:
314 		seq_puts(seq, ",posixacl");
315 		break;
316 	default:
317 		seq_puts(seq, ",noacl");
318 		break;
319 	}
320 #endif /* CONFIG_FS_POSIX_ACL */
321 
322 	switch (zfsvfs->z_case) {
323 	case ZFS_CASE_SENSITIVE:
324 		seq_puts(seq, ",casesensitive");
325 		break;
326 	case ZFS_CASE_INSENSITIVE:
327 		seq_puts(seq, ",caseinsensitive");
328 		break;
329 	default:
330 		seq_puts(seq, ",casemixed");
331 		break;
332 	}
333 
334 	return (0);
335 }
336 
337 static int
zpl_show_options(struct seq_file * seq,struct dentry * root)338 zpl_show_options(struct seq_file *seq, struct dentry *root)
339 {
340 	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
341 }
342 
343 static int
zpl_test_super(struct super_block * s,void * data)344 zpl_test_super(struct super_block *s, void *data)
345 {
346 	zfsvfs_t *zfsvfs = s->s_fs_info;
347 	objset_t *os = data;
348 	/*
349 	 * If the os doesn't match the z_os in the super_block, assume it is
350 	 * not a match. Matching would imply a multimount of a dataset. It is
351 	 * possible that during a multimount, there is a simultaneous operation
352 	 * that changes the z_os, e.g., rollback, where the match will be
353 	 * missed, but in that case the user will get an EBUSY.
354 	 */
355 	return (zfsvfs != NULL && os == zfsvfs->z_os);
356 }
357 
/*
 * ->kill_sb() callback: run the ZFS pre-unmount step, then hand the
 * superblock to kill_anon_super().
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);

	kill_anon_super(sb);
}
364 
365 void
zpl_prune_sb(uint64_t nr_to_scan,void * arg)366 zpl_prune_sb(uint64_t nr_to_scan, void *arg)
367 {
368 	struct super_block *sb = (struct super_block *)arg;
369 	int objects = 0;
370 
371 	/*
372 	 * Ensure the superblock is not in the process of being torn down.
373 	 */
374 #ifdef HAVE_SB_DYING
375 	if (down_read_trylock(&sb->s_umount)) {
376 		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
377 		    (sb->s_flags & SB_BORN)) {
378 			(void) zfs_prune(sb, nr_to_scan, &objects);
379 		}
380 		up_read(&sb->s_umount);
381 	}
382 #else
383 	if (down_read_trylock(&sb->s_umount)) {
384 		if (!hlist_unhashed(&sb->s_instances) &&
385 		    sb->s_root && (sb->s_flags & SB_BORN)) {
386 			(void) zfs_prune(sb, nr_to_scan, &objects);
387 		}
388 		up_read(&sb->s_umount);
389 	}
390 #endif
391 }
392 
393 static int
zpl_parse_monolithic(struct fs_context * fc,void * data)394 zpl_parse_monolithic(struct fs_context *fc, void *data)
395 {
396 	/*
397 	 * We do options parsing in zfs_domount(); just stash the options blob
398 	 * in the fs_context so we can pass it down later.
399 	 */
400 	fc->fs_private = data;
401 	return (0);
402 }
403 
404 static int
zpl_get_tree(struct fs_context * fc)405 zpl_get_tree(struct fs_context *fc)
406 {
407 	struct super_block *sb;
408 	objset_t *os;
409 	boolean_t issnap = B_FALSE;
410 	int err;
411 
412 	err = dmu_objset_hold(fc->source, FTAG, &os);
413 	if (err)
414 		return (-err);
415 
416 	/*
417 	 * The dsl pool lock must be released prior to calling sget().
418 	 * It is possible sget() may block on the lock in grab_super()
419 	 * while deactivate_super() holds that same lock and waits for
420 	 * a txg sync.  If the dsl_pool lock is held over sget()
421 	 * this can prevent the pool sync and cause a deadlock.
422 	 */
423 	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
424 	dsl_pool_rele(dmu_objset_pool(os), FTAG);
425 
426 	sb = sget(fc->fs_type, zpl_test_super, set_anon_super,
427 	    fc->sb_flags, os);
428 
429 	/*
430 	 * Recheck with the lock held to prevent mounting the wrong dataset
431 	 * since z_os can be stale when the teardown lock is held.
432 	 *
433 	 * We can't do this in zpl_test_super in since it's under spinlock and
434 	 * also s_umount lock is not held there so it would race with
435 	 * zfs_umount and zfsvfs can be freed.
436 	 */
437 	if (!IS_ERR(sb) && sb->s_fs_info != NULL) {
438 		zfsvfs_t *zfsvfs = sb->s_fs_info;
439 		if (zpl_enter(zfsvfs, FTAG) == 0) {
440 			if (os != zfsvfs->z_os)
441 				err = SET_ERROR(EBUSY);
442 			issnap = zfsvfs->z_issnap;
443 			zpl_exit(zfsvfs, FTAG);
444 		} else {
445 			err = SET_ERROR(EBUSY);
446 		}
447 	}
448 	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
449 	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
450 
451 	if (IS_ERR(sb))
452 		return (PTR_ERR(sb));
453 
454 	if (err) {
455 		deactivate_locked_super(sb);
456 		return (-err);
457 	}
458 
459 	if (sb->s_root == NULL) {
460 		zfs_mnt_t zm = {
461 		    .mnt_osname = fc->source,
462 		    .mnt_data = fc->fs_private,
463 		};
464 
465 		fstrans_cookie_t cookie = spl_fstrans_mark();
466 		err = zfs_domount(sb, &zm, fc->sb_flags & SB_SILENT ? 1 : 0);
467 		spl_fstrans_unmark(cookie);
468 
469 		if (err) {
470 			deactivate_locked_super(sb);
471 			return (-err);
472 		}
473 
474 		sb->s_flags |= SB_ACTIVE;
475 	} else if (!issnap && ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)) {
476 		/*
477 		 * Skip ro check for snap since snap is always ro regardless
478 		 * ro flag is passed by mount or not.
479 		 */
480 		deactivate_locked_super(sb);
481 		return (-SET_ERROR(EBUSY));
482 	}
483 
484 	struct dentry *root = dget(sb->s_root);
485 	if (IS_ERR(root))
486 		return (PTR_ERR(root));
487 
488 	fc->root = root;
489 	return (0);
490 }
491 
492 static int
zpl_reconfigure(struct fs_context * fc)493 zpl_reconfigure(struct fs_context *fc)
494 {
495 	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = fc->fs_private };
496 	fstrans_cookie_t cookie;
497 	int error;
498 
499 	cookie = spl_fstrans_mark();
500 	error = -zfs_remount(fc->root->d_sb, &fc->sb_flags, &zm);
501 	spl_fstrans_unmark(cookie);
502 	ASSERT3S(error, <=, 0);
503 
504 	return (error);
505 }
506 
507 const struct fs_context_operations zpl_fs_context_operations = {
508 	.parse_monolithic	= zpl_parse_monolithic,
509 	.get_tree		= zpl_get_tree,
510 	.reconfigure		= zpl_reconfigure,
511 };
512 
513 static int
zpl_init_fs_context(struct fs_context * fc)514 zpl_init_fs_context(struct fs_context *fc)
515 {
516 	fc->ops = &zpl_fs_context_operations;
517 	return (0);
518 }
519 
520 const struct super_operations zpl_super_operations = {
521 	.alloc_inode		= zpl_inode_alloc,
522 #ifdef HAVE_SOPS_FREE_INODE
523 	.free_inode		= zpl_inode_free,
524 #endif
525 	.destroy_inode		= zpl_inode_destroy,
526 	.dirty_inode		= zpl_dirty_inode,
527 	.write_inode		= NULL,
528 	.drop_inode		= zpl_drop_inode,
529 	.evict_inode		= zpl_evict_inode,
530 	.put_super		= zpl_put_super,
531 	.sync_fs		= zpl_sync_fs,
532 	.statfs			= zpl_statfs,
533 	.show_devname		= zpl_show_devname,
534 	.show_options		= zpl_show_options,
535 	.show_stats		= NULL,
536 };
537 
538 /*
539  * ->d_delete() is called when the last reference to a dentry is released. Its
540  *  return value indicates if the dentry should be destroyed immediately, or
541  *  retained in the dentry cache.
542  *
543  * By default (zfs_delete_dentry=0) the kernel will always cache unused
544  * entries.  Each dentry holds an inode reference, so cached dentries can hold
545  * the final inode reference indefinitely, leading to the inode and its related
546  * data being pinned (see zpl_drop_inode()).
547  *
548  * When set to 1, we signal that the dentry should be destroyed immediately and
549  * never cached. This reduces memory usage, at the cost of higher overheads to
550  * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
551  * reloaded and reinflated.
552  *
553  * Note that userspace does not have direct control over dentry references and
554  * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
555  * (eg vm.vfs_cache_pressure).
556  */
557 static int
zpl_dentry_delete(const struct dentry * dentry)558 zpl_dentry_delete(const struct dentry *dentry)
559 {
560 	return (zfs_delete_dentry ? 1 : 0);
561 }
562 
563 const struct dentry_operations zpl_dentry_operations = {
564 	.d_delete = zpl_dentry_delete,
565 };
566 
567 struct file_system_type zpl_fs_type = {
568 	.owner			= THIS_MODULE,
569 	.name			= ZFS_DRIVER,
570 #if defined(HAVE_IDMAP_MNT_API)
571 	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
572 #else
573 	.fs_flags		= FS_USERNS_MOUNT,
574 #endif
575 	.init_fs_context	= zpl_init_fs_context,
576 	.kill_sb		= zpl_kill_sb,
577 };
578 
579 ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
580 	"Delete inodes as soon as the last reference is released.");
581 
582 ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
583 	"Delete dentries from dentry cache as soon as the last reference is "
584 	"released.");
585