1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
24 * Copyright (c) 2023, Datto Inc. All rights reserved.
25 * Copyright (c) 2025, Klara, Inc.
26 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
27 * Copyright (c) 2026, TrueNAS.
28 */
29
30
31 #include <sys/zfs_znode.h>
32 #include <sys/zfs_vfsops.h>
33 #include <sys/zfs_vnops.h>
34 #include <sys/zfs_ctldir.h>
35 #include <sys/zpl.h>
36 #include <linux/iversion.h>
37 #include <linux/version.h>
38 #include <linux/vfs_compat.h>
39 #include <linux/fs_context.h>
40
/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 *
 * Exposed as the zfs_delete_inode module parameter (ZMOD_RW).
 */
int zfs_delete_inode = 0;
47
/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 *
 * Exposed as the zfs_delete_dentry module parameter (ZMOD_RW).
 */
int zfs_delete_dentry = 0;
55
56 static struct inode *
zpl_inode_alloc(struct super_block * sb)57 zpl_inode_alloc(struct super_block *sb)
58 {
59 struct inode *ip;
60
61 VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
62 inode_set_iversion(ip, 1);
63
64 return (ip);
65 }
66
#ifdef HAVE_SOPS_FREE_INODE
/*
 * ->free_inode(): final release of the inode memory, only wired up when the
 * kernel provides the free_inode superblock operation (HAVE_SOPS_FREE_INODE).
 * The inode must have no remaining references when this is called.
 */
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif
75
/*
 * ->destroy_inode(): tear down an inode that will no longer be used. The
 * inode must have no remaining references when this is called.
 */
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}
82
/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	/* Avoid re-entering the filesystem from direct memory reclaim. */
	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}
97
98 /*
99 * ->drop_inode() is called when the last reference to an inode is released.
100 * Its return value indicates if the inode should be destroyed immediately, or
101 * cached on the superblock structure.
102 *
103 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
104 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
105 * longer exists on disk). On datasets with millions of rarely-accessed files,
106 * this can cause a large amount of memory to be "pinned" by cached inodes,
107 * which in turn pin their associated dnodes and dbufs, until the kernel starts
108 * reporting memory pressure and requests OpenZFS release some memory (see
109 * zfs_prune()).
110 *
 * When set to 1, we call generic_delete_inode(), which always returns "destroy
 * immediately", resulting in inodes being destroyed immediately, releasing
 * their associated dnodes and dbufs to the dbuf cache and the ARC to be
 * evicted as normal.
115 *
116 * Note that the "last reference" doesn't always mean the last _userspace_
117 * reference; the dentry cache also holds a reference, so "busy" inodes will
118 * still be kept alive that way (subject to dcache tuning).
119 */
120 static int
zpl_drop_inode(struct inode * ip)121 zpl_drop_inode(struct inode *ip)
122 {
123 if (zfs_delete_inode)
124 return (generic_delete_inode(ip));
125 return (generic_drop_inode(ip));
126 }
127
/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to cleanup
 * any remaining filesystem specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	/* Avoid re-entering the filesystem from direct memory reclaim. */
	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}
149
/*
 * ->put_super(): release the superblock on unmount by handing it to
 * zfs_umount(). The callback returns void, so any error from zfs_umount()
 * can only be asserted on, not propagated.
 */
static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}
161
162 /*
163 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
164 * syscalls, via sb->s_op->sync_fs().
165 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
171 *
172 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
173 * error store `s_wb_err`, to carry errors seen on page writeback since the
174 * last call to syncfs(). If sync_filesystem() does not return an error, any
175 * existing writeback error on the superblock will be used instead (and cleared
176 * either way). We don't use this (page writeback is a different thing for us),
177 * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
178 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
184 *
185 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
186 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
187 * mainline Linux series at time of writing), and has likely been backported to
188 * vendor kernels before 5.8. We don't really want to use a workaround when we
189 * don't have to, but we can't really detect whether or not sync_filesystem()
190 * will return our errors (without a difficult runtime test anyway). So, we use
191 * a static version check: any kernel reporting its version as 5.17+ will use a
192 * direct error return, otherwise, we'll either use s_wb_err if it was detected
193 * at configure (5.8-5.16 + vendor backports). If it's unavailable, we will
194 * block to ensure the correct semantics.
195 *
196 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
197 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	/*
	 * 5.8-5.16: sync_filesystem() ignores our return value, but will
	 * pick up and report an error recorded on sb->s_wb_err (see the
	 * block comment above).
	 */
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	/*
	 * Pre-5.8: the error cannot be reported at all. Block until the
	 * current txg has synced, then clear the error so the (unavoidable)
	 * "success" report to userspace is at least true (see the block
	 * comment above).
	 */
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
232
/*
 * ->statfs(): fill in filesystem statistics via zfs_statvfs(), then rescale
 * the results for 32-bit callers that cannot represent the full 64-bit
 * counts.
 */
static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64EiB to be reported. The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		/* Double the block size (halving counts) until blocks fit. */
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		/* Cap f_files at UINT32_MAX while preserving used count. */
		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
268
/*
 * Emit the dataset name as the mount source for /proc/self/mounts,
 * escaping spaces so the line remains parseable by getmntent(3).
 */
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}
298
/* ->show_devname() entry point; see __zpl_show_devname(). */
static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}
304
305 static int
__zpl_show_options(struct seq_file * seq,zfsvfs_t * zfsvfs)306 __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
307 {
308 seq_printf(seq, ",%s",
309 zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
310
311 #ifdef CONFIG_FS_POSIX_ACL
312 switch (zfsvfs->z_acl_type) {
313 case ZFS_ACLTYPE_POSIX:
314 seq_puts(seq, ",posixacl");
315 break;
316 default:
317 seq_puts(seq, ",noacl");
318 break;
319 }
320 #endif /* CONFIG_FS_POSIX_ACL */
321
322 switch (zfsvfs->z_case) {
323 case ZFS_CASE_SENSITIVE:
324 seq_puts(seq, ",casesensitive");
325 break;
326 case ZFS_CASE_INSENSITIVE:
327 seq_puts(seq, ",caseinsensitive");
328 break;
329 default:
330 seq_puts(seq, ",casemixed");
331 break;
332 }
333
334 return (0);
335 }
336
/* ->show_options() entry point; see __zpl_show_options(). */
static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}
342
/*
 * sget() match callback: compare the objset being mounted (data) against
 * the objset behind an existing superblock.
 */
static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}
357
/*
 * ->kill_sb(): let ZFS flush its state first via zfs_preumount(), then run
 * the generic anonymous-superblock teardown.
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
364
365 void
zpl_prune_sb(uint64_t nr_to_scan,void * arg)366 zpl_prune_sb(uint64_t nr_to_scan, void *arg)
367 {
368 struct super_block *sb = (struct super_block *)arg;
369 int objects = 0;
370
371 /*
372 * Ensure the superblock is not in the process of being torn down.
373 */
374 #ifdef HAVE_SB_DYING
375 if (down_read_trylock(&sb->s_umount)) {
376 if (!(sb->s_flags & SB_DYING) && sb->s_root &&
377 (sb->s_flags & SB_BORN)) {
378 (void) zfs_prune(sb, nr_to_scan, &objects);
379 }
380 up_read(&sb->s_umount);
381 }
382 #else
383 if (down_read_trylock(&sb->s_umount)) {
384 if (!hlist_unhashed(&sb->s_instances) &&
385 sb->s_root && (sb->s_flags & SB_BORN)) {
386 (void) zfs_prune(sb, nr_to_scan, &objects);
387 }
388 up_read(&sb->s_umount);
389 }
390 #endif
391 }
392
static int
zpl_parse_monolithic(struct fs_context *fc, void *data)
{
	/*
	 * We do options parsing in zfs_domount(); just stash the options blob
	 * in the fs_context so we can pass it down later. Both zpl_get_tree()
	 * and zpl_reconfigure() read it back out of fc->fs_private.
	 */
	fc->fs_private = data;
	return (0);
}
403
/*
 * ->get_tree(): find or create the superblock for the dataset named in
 * fc->source, mount it if it is not already mounted, and hang the root
 * dentry on the fs_context.
 */
static int
zpl_get_tree(struct fs_context *fc)
{
	struct super_block *sb;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(fc->source, FTAG, &os);
	if (err)
		return (-err);

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 *
	 * A long hold keeps the dataset alive across the pool release.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	sb = sget(fc->fs_type, zpl_test_super, set_anon_super,
	    fc->sb_flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it's under spinlock and
	 * also s_umount lock is not held there so it would race with
	 * zfs_umount and zfsvfs can be freed.
	 */
	if (!IS_ERR(sb) && sb->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = SET_ERROR(EBUSY);
		}
	}
	/* The dataset holds are no longer needed once sget() has returned. */
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(sb))
		return (PTR_ERR(sb));

	if (err) {
		deactivate_locked_super(sb);
		return (-err);
	}

	if (sb->s_root == NULL) {
		/* Newly-created superblock: do the actual mount. */
		zfs_mnt_t zm = {
			.mnt_osname = fc->source,
			.mnt_data = fc->fs_private,
		};

		fstrans_cookie_t cookie = spl_fstrans_mark();
		err = zfs_domount(sb, &zm, fc->sb_flags & SB_SILENT ? 1 : 0);
		spl_fstrans_unmark(cookie);

		if (err) {
			deactivate_locked_super(sb);
			return (-err);
		}

		sb->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)) {
		/*
		 * Skip ro check for snap since snap is always ro regardless
		 * ro flag is passed by mount or not.
		 */
		deactivate_locked_super(sb);
		return (-SET_ERROR(EBUSY));
	}

	struct dentry *root = dget(sb->s_root);
	if (IS_ERR(root))
		return (PTR_ERR(root));

	fc->root = root;
	return (0);
}
491
/*
 * ->reconfigure(): handle remount by passing the stashed options blob
 * (fc->fs_private, see zpl_parse_monolithic()) and the new flags down to
 * zfs_remount().
 */
static int
zpl_reconfigure(struct fs_context *fc)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = fc->fs_private };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(fc->root->d_sb, &fc->sb_flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}
506
/* Operations attached to every fs_context created for this filesystem. */
const struct fs_context_operations zpl_fs_context_operations = {
	.parse_monolithic	= zpl_parse_monolithic,
	.get_tree		= zpl_get_tree,
	.reconfigure		= zpl_reconfigure,
};
512
/*
 * ->init_fs_context(): attach our fs_context operations; actual mount and
 * remount processing happens in zpl_get_tree() and zpl_reconfigure().
 */
static int
zpl_init_fs_context(struct fs_context *fc)
{
	fc->ops = &zpl_fs_context_operations;
	return (0);
}
519
/*
 * Superblock operations table. write_inode and show_stats are intentionally
 * left unimplemented (NULL).
 */
const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.drop_inode		= zpl_drop_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};
537
538 /*
539 * ->d_delete() is called when the last reference to a dentry is released. Its
540 * return value indicates if the dentry should be destroyed immediately, or
541 * retained in the dentry cache.
542 *
543 * By default (zfs_delete_dentry=0) the kernel will always cache unused
544 * entries. Each dentry holds an inode reference, so cached dentries can hold
545 * the final inode reference indefinitely, leading to the inode and its related
546 * data being pinned (see zpl_drop_inode()).
547 *
548 * When set to 1, we signal that the dentry should be destroyed immediately and
549 * never cached. This reduces memory usage, at the cost of higher overheads to
550 * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
551 * reloaded and reinflated.
552 *
553 * Note that userspace does not have direct control over dentry references and
554 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
555 * (eg vm.vfs_cache_pressure).
556 */
557 static int
zpl_dentry_delete(const struct dentry * dentry)558 zpl_dentry_delete(const struct dentry *dentry)
559 {
560 return (zfs_delete_dentry ? 1 : 0);
561 }
562
/* Dentry operations table; see zpl_dentry_delete(). */
const struct dentry_operations zpl_dentry_operations = {
	.d_delete	= zpl_dentry_delete,
};
566
/*
 * Filesystem type registration. FS_ALLOW_IDMAP is only advertised when the
 * kernel's idmapped-mount API was detected at configure time
 * (HAVE_IDMAP_MNT_API).
 */
struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.init_fs_context	= zpl_init_fs_context,
	.kill_sb		= zpl_kill_sb,
};
578
/* Module parameters backing zfs_delete_inode / zfs_delete_dentry above. */
ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");
585