// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
 * All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
29 */ 30 31 /* Portions Copyright 2010 Robert Milkowski */ 32 33 #include <sys/types.h> 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/kernel.h> 37 #include <sys/sysmacros.h> 38 #include <sys/kmem.h> 39 #include <sys/acl.h> 40 #include <sys/vnode.h> 41 #include <sys/vfs.h> 42 #include <sys/mntent.h> 43 #include <sys/mount.h> 44 #include <sys/cmn_err.h> 45 #include <sys/zfs_znode.h> 46 #include <sys/zfs_vnops.h> 47 #include <sys/zfs_dir.h> 48 #include <sys/zil.h> 49 #include <sys/fs/zfs.h> 50 #include <sys/dmu.h> 51 #include <sys/dsl_prop.h> 52 #include <sys/dsl_dataset.h> 53 #include <sys/dsl_deleg.h> 54 #include <sys/spa.h> 55 #include <sys/zap.h> 56 #include <sys/sa.h> 57 #include <sys/sa_impl.h> 58 #include <sys/policy.h> 59 #include <sys/atomic.h> 60 #include <sys/zfs_ioctl.h> 61 #include <sys/zfs_ctldir.h> 62 #include <sys/zfs_fuid.h> 63 #include <sys/sunddi.h> 64 #include <sys/dmu_objset.h> 65 #include <sys/dsl_dir.h> 66 #include <sys/jail.h> 67 #include <sys/osd.h> 68 #include <ufs/ufs/quota.h> 69 #include <sys/zfs_quota.h> 70 71 #include "zfs_comutil.h" 72 73 #ifndef MNTK_VMSETSIZE_BUG 74 #define MNTK_VMSETSIZE_BUG 0 75 #endif 76 #ifndef MNTK_NOMSYNC 77 #define MNTK_NOMSYNC 8 78 #endif 79 80 struct mtx zfs_debug_mtx; 81 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); 82 83 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); 84 85 int zfs_super_owner; 86 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, 87 "File system owners can perform privileged operation on file systems"); 88 89 int zfs_debug_level; 90 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, 91 "Debug level"); 92 93 struct zfs_jailparam { 94 int mount_snapshot; 95 }; 96 97 static struct zfs_jailparam zfs_jailparam0 = { 98 .mount_snapshot = 0, 99 }; 100 101 static int zfs_jailparam_slot; 102 103 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); 104 SYSCTL_JAIL_PARAM(_zfs, 
mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", 105 "Allow mounting snapshots in the .zfs directory for unjailed datasets"); 106 107 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); 108 static int zfs_version_acl = ZFS_ACL_VERSION; 109 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, 110 "ZFS_ACL_VERSION"); 111 static int zfs_version_spa = SPA_VERSION; 112 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, 113 "SPA_VERSION"); 114 static int zfs_version_zpl = ZPL_VERSION; 115 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, 116 "ZPL_VERSION"); 117 118 #if __FreeBSD_version >= 1400018 119 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, 120 bool *mp_busy); 121 #else 122 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); 123 #endif 124 static int zfs_mount(vfs_t *vfsp); 125 static int zfs_umount(vfs_t *vfsp, int fflag); 126 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); 127 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); 128 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); 129 static int zfs_sync(vfs_t *vfsp, int waitfor); 130 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 131 struct ucred **credanonp, int *numsecflavors, int *secflavors); 132 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); 133 static void zfs_freevfs(vfs_t *vfsp); 134 135 struct vfsops zfs_vfsops = { 136 .vfs_mount = zfs_mount, 137 .vfs_unmount = zfs_umount, 138 .vfs_root = vfs_cache_root, 139 .vfs_cachedroot = zfs_root, 140 .vfs_statfs = zfs_statfs, 141 .vfs_vget = zfs_vget, 142 .vfs_sync = zfs_sync, 143 .vfs_checkexp = zfs_checkexp, 144 .vfs_fhtovp = zfs_fhtovp, 145 .vfs_quotactl = zfs_quotactl, 146 }; 147 148 VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL 149 #ifdef VFCF_CROSS_COPY_FILE_RANGE 150 | VFCF_CROSS_COPY_FILE_RANGE 151 #endif 152 #ifdef 
VFCF_FILEREVINC 153 | VFCF_FILEREVINC 154 #endif 155 ); 156 157 /* 158 * We need to keep a count of active fs's. 159 * This is necessary to prevent our module 160 * from being unloaded after a umount -f 161 */ 162 static uint32_t zfs_active_fs_count = 0; 163 164 int 165 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, 166 char *setpoint) 167 { 168 int error; 169 zfsvfs_t *zfvp; 170 vfs_t *vfsp; 171 objset_t *os; 172 uint64_t tmp = *val; 173 174 error = dmu_objset_from_ds(ds, &os); 175 if (error != 0) 176 return (error); 177 178 error = getzfsvfs_impl(os, &zfvp); 179 if (error != 0) 180 return (error); 181 if (zfvp == NULL) 182 return (ENOENT); 183 vfsp = zfvp->z_vfs; 184 switch (zfs_prop) { 185 case ZFS_PROP_ATIME: 186 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) 187 tmp = 0; 188 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) 189 tmp = 1; 190 break; 191 case ZFS_PROP_DEVICES: 192 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 193 tmp = 0; 194 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) 195 tmp = 1; 196 break; 197 case ZFS_PROP_EXEC: 198 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 199 tmp = 0; 200 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) 201 tmp = 1; 202 break; 203 case ZFS_PROP_SETUID: 204 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 205 tmp = 0; 206 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) 207 tmp = 1; 208 break; 209 case ZFS_PROP_READONLY: 210 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) 211 tmp = 0; 212 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 213 tmp = 1; 214 break; 215 case ZFS_PROP_XATTR: 216 if (zfvp->z_flags & ZSB_XATTR) 217 tmp = zfvp->z_xattr; 218 break; 219 case ZFS_PROP_NBMAND: 220 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 221 tmp = 0; 222 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 223 tmp = 1; 224 break; 225 default: 226 vfs_unbusy(vfsp); 227 return (ENOENT); 228 } 229 230 vfs_unbusy(vfsp); 231 if (tmp != *val) { 232 if (setpoint) 233 (void) strcpy(setpoint, "temporary"); 234 
*val = tmp; 235 } 236 return (0); 237 } 238 239 static int 240 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) 241 { 242 int error = 0; 243 char buf[32]; 244 uint64_t usedobj, quotaobj, defaultquota; 245 uint64_t quota, used = 0; 246 timespec_t now; 247 248 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; 249 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; 250 defaultquota = isgroup ? zfsvfs->z_defaultgroupquota : 251 zfsvfs->z_defaultuserquota; 252 253 if (zfsvfs->z_replay) 254 return (ENOENT); 255 256 (void) sprintf(buf, "%llx", (longlong_t)id); 257 if (quotaobj == 0) { 258 if (defaultquota == 0) 259 return (ENOENT); 260 quota = defaultquota; 261 } else { 262 error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota), 263 1, "a); 264 if (error && (quota = defaultquota) == 0) 265 return (error); 266 } 267 268 /* 269 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". 270 * So we set them to be the same. 271 */ 272 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); 273 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); 274 if (error == ENOENT) 275 error = 0; 276 if (error) 277 return (error); 278 dqp->dqb_curblocks = btodb(used); 279 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; 280 vfs_timestamp(&now); 281 /* 282 * Setting this to 0 causes FreeBSD quota(8) to print 283 * the number of days since the epoch, which isn't 284 * particularly useful. 
285 */ 286 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; 287 return (error); 288 } 289 290 static int 291 #if __FreeBSD_version >= 1400018 292 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy) 293 #else 294 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) 295 #endif 296 { 297 zfsvfs_t *zfsvfs = vfsp->vfs_data; 298 struct thread *td; 299 int cmd, type, error = 0; 300 int bitsize; 301 zfs_userquota_prop_t quota_type; 302 struct dqblk64 dqblk = { 0 }; 303 304 td = curthread; 305 cmd = cmds >> SUBCMDSHIFT; 306 type = cmds & SUBCMDMASK; 307 308 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 309 return (error); 310 if (id == -1) { 311 switch (type) { 312 case USRQUOTA: 313 id = td->td_ucred->cr_ruid; 314 break; 315 case GRPQUOTA: 316 id = td->td_ucred->cr_rgid; 317 break; 318 default: 319 error = EINVAL; 320 #if __FreeBSD_version < 1400018 321 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) 322 vfs_unbusy(vfsp); 323 #endif 324 goto done; 325 } 326 } 327 /* 328 * Map BSD type to: 329 * ZFS_PROP_USERUSED, 330 * ZFS_PROP_USERQUOTA, 331 * ZFS_PROP_GROUPUSED, 332 * ZFS_PROP_GROUPQUOTA 333 */ 334 switch (cmd) { 335 case Q_SETQUOTA: 336 case Q_SETQUOTA32: 337 if (type == USRQUOTA) 338 quota_type = ZFS_PROP_USERQUOTA; 339 else if (type == GRPQUOTA) 340 quota_type = ZFS_PROP_GROUPQUOTA; 341 else 342 error = EINVAL; 343 break; 344 case Q_GETQUOTA: 345 case Q_GETQUOTA32: 346 if (type == USRQUOTA) 347 quota_type = ZFS_PROP_USERUSED; 348 else if (type == GRPQUOTA) 349 quota_type = ZFS_PROP_GROUPUSED; 350 else 351 error = EINVAL; 352 break; 353 } 354 355 /* 356 * Depending on the cmd, we may need to get 357 * the ruid and domain (see fuidstr_to_sid?), 358 * the fuid (how?), or other information. 359 * Create fuid using zfs_fuid_create(zfsvfs, id, 360 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? 361 * I think I can use just the id? 362 * 363 * Look at zfs_id_overquota() to look up a quota. 
364 * zap_lookup(something, quotaobj, fuidstring, 365 * sizeof (long long), 1, "a) 366 * 367 * See zfs_set_userquota() to set a quota. 368 */ 369 if ((uint32_t)type >= MAXQUOTAS) { 370 error = EINVAL; 371 goto done; 372 } 373 374 switch (cmd) { 375 case Q_GETQUOTASIZE: 376 bitsize = 64; 377 error = copyout(&bitsize, arg, sizeof (int)); 378 break; 379 case Q_QUOTAON: 380 // As far as I can tell, you can't turn quotas on or off on zfs 381 error = 0; 382 #if __FreeBSD_version < 1400018 383 vfs_unbusy(vfsp); 384 #endif 385 break; 386 case Q_QUOTAOFF: 387 error = ENOTSUP; 388 #if __FreeBSD_version < 1400018 389 vfs_unbusy(vfsp); 390 #endif 391 break; 392 case Q_SETQUOTA: 393 error = copyin(arg, &dqblk, sizeof (dqblk)); 394 if (error == 0) 395 error = zfs_set_userquota(zfsvfs, quota_type, 396 "", id, dbtob(dqblk.dqb_bhardlimit)); 397 break; 398 case Q_GETQUOTA: 399 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); 400 if (error == 0) 401 error = copyout(&dqblk, arg, sizeof (dqblk)); 402 break; 403 default: 404 error = EINVAL; 405 break; 406 } 407 done: 408 zfs_exit(zfsvfs, FTAG); 409 return (error); 410 } 411 412 413 boolean_t 414 zfs_is_readonly(zfsvfs_t *zfsvfs) 415 { 416 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); 417 } 418 419 static int 420 zfs_sync(vfs_t *vfsp, int waitfor) 421 { 422 423 /* 424 * Data integrity is job one. We don't want a compromised kernel 425 * writing to the storage pool, so we never sync during panic. 426 */ 427 if (panicstr) 428 return (0); 429 430 /* 431 * Ignore the system syncher. ZFS already commits async data 432 * at zfs_txg_timeout intervals. 433 */ 434 if (waitfor == MNT_LAZY) 435 return (0); 436 437 if (vfsp != NULL) { 438 /* 439 * Sync a specific filesystem. 
440 */ 441 zfsvfs_t *zfsvfs = vfsp->vfs_data; 442 dsl_pool_t *dp; 443 int error; 444 445 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 446 return (error); 447 dp = dmu_objset_pool(zfsvfs->z_os); 448 449 /* 450 * If the system is shutting down, then skip any 451 * filesystems which may exist on a suspended pool. 452 */ 453 if (rebooting && spa_suspended(dp->dp_spa)) { 454 zfs_exit(zfsvfs, FTAG); 455 return (0); 456 } 457 458 if (zfsvfs->z_log != NULL) { 459 error = zil_commit(zfsvfs->z_log, 0); 460 if (error != 0) { 461 zfs_exit(zfsvfs, FTAG); 462 return (error); 463 } 464 } 465 466 zfs_exit(zfsvfs, FTAG); 467 } else { 468 /* 469 * Sync all ZFS filesystems. This is what happens when you 470 * run sync(8). Unlike other filesystems, ZFS honors the 471 * request by waiting for all pools to commit all dirty data. 472 */ 473 spa_sync_allpools(); 474 } 475 476 return (0); 477 } 478 479 static void 480 atime_changed_cb(void *arg, uint64_t newval) 481 { 482 zfsvfs_t *zfsvfs = arg; 483 484 if (newval == TRUE) { 485 zfsvfs->z_atime = TRUE; 486 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; 487 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 488 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 489 } else { 490 zfsvfs->z_atime = FALSE; 491 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; 492 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 493 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 494 } 495 } 496 497 static void 498 xattr_changed_cb(void *arg, uint64_t newval) 499 { 500 zfsvfs_t *zfsvfs = arg; 501 502 if (newval == ZFS_XATTR_OFF) { 503 zfsvfs->z_flags &= ~ZSB_XATTR; 504 } else { 505 zfsvfs->z_flags |= ZSB_XATTR; 506 507 if (newval == ZFS_XATTR_SA) 508 zfsvfs->z_xattr_sa = B_TRUE; 509 else 510 zfsvfs->z_xattr_sa = B_FALSE; 511 } 512 } 513 514 static void 515 blksz_changed_cb(void *arg, uint64_t newval) 516 { 517 zfsvfs_t *zfsvfs = arg; 518 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 519 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 520 ASSERT(ISP2(newval)); 
521 522 zfsvfs->z_max_blksz = newval; 523 zfsvfs->z_vfs->mnt_stat.f_iosize = newval; 524 } 525 526 static void 527 readonly_changed_cb(void *arg, uint64_t newval) 528 { 529 zfsvfs_t *zfsvfs = arg; 530 531 if (newval) { 532 /* XXX locking on vfs_flag? */ 533 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 534 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 535 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 536 } else { 537 /* XXX locking on vfs_flag? */ 538 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 539 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 540 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 541 } 542 } 543 544 static void 545 setuid_changed_cb(void *arg, uint64_t newval) 546 { 547 zfsvfs_t *zfsvfs = arg; 548 549 if (newval == FALSE) { 550 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 551 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 552 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 553 } else { 554 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 555 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 556 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 557 } 558 } 559 560 static void 561 exec_changed_cb(void *arg, uint64_t newval) 562 { 563 zfsvfs_t *zfsvfs = arg; 564 565 if (newval == FALSE) { 566 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 567 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 568 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 569 } else { 570 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 571 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 572 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 573 } 574 } 575 576 /* 577 * The nbmand mount option can be changed at mount time. 
578 * We can't allow it to be toggled on live file systems or incorrect 579 * behavior may be seen from cifs clients 580 * 581 * This property isn't registered via dsl_prop_register(), but this callback 582 * will be called when a file system is first mounted 583 */ 584 static void 585 nbmand_changed_cb(void *arg, uint64_t newval) 586 { 587 zfsvfs_t *zfsvfs = arg; 588 if (newval == FALSE) { 589 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 590 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 591 } else { 592 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 593 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 594 } 595 } 596 597 static void 598 snapdir_changed_cb(void *arg, uint64_t newval) 599 { 600 zfsvfs_t *zfsvfs = arg; 601 602 zfsvfs->z_show_ctldir = newval; 603 } 604 605 static void 606 acl_mode_changed_cb(void *arg, uint64_t newval) 607 { 608 zfsvfs_t *zfsvfs = arg; 609 610 zfsvfs->z_acl_mode = newval; 611 } 612 613 static void 614 acl_inherit_changed_cb(void *arg, uint64_t newval) 615 { 616 zfsvfs_t *zfsvfs = arg; 617 618 zfsvfs->z_acl_inherit = newval; 619 } 620 621 static void 622 acl_type_changed_cb(void *arg, uint64_t newval) 623 { 624 zfsvfs_t *zfsvfs = arg; 625 626 zfsvfs->z_acl_type = newval; 627 } 628 629 static void 630 longname_changed_cb(void *arg, uint64_t newval) 631 { 632 zfsvfs_t *zfsvfs = arg; 633 634 zfsvfs->z_longname = newval; 635 } 636 637 static int 638 zfs_register_callbacks(vfs_t *vfsp) 639 { 640 struct dsl_dataset *ds = NULL; 641 objset_t *os = NULL; 642 zfsvfs_t *zfsvfs = NULL; 643 uint64_t nbmand; 644 boolean_t readonly = B_FALSE; 645 boolean_t do_readonly = B_FALSE; 646 boolean_t setuid = B_FALSE; 647 boolean_t do_setuid = B_FALSE; 648 boolean_t exec = B_FALSE; 649 boolean_t do_exec = B_FALSE; 650 boolean_t xattr = B_FALSE; 651 boolean_t atime = B_FALSE; 652 boolean_t do_atime = B_FALSE; 653 boolean_t do_xattr = B_FALSE; 654 int error = 0; 655 656 ASSERT3P(vfsp, !=, NULL); 657 zfsvfs = vfsp->vfs_data; 658 
ASSERT3P(zfsvfs, !=, NULL); 659 os = zfsvfs->z_os; 660 661 /* 662 * This function can be called for a snapshot when we update snapshot's 663 * mount point, which isn't really supported. 664 */ 665 if (dmu_objset_is_snapshot(os)) 666 return (EOPNOTSUPP); 667 668 /* 669 * The act of registering our callbacks will destroy any mount 670 * options we may have. In order to enable temporary overrides 671 * of mount options, we stash away the current values and 672 * restore them after we register the callbacks. 673 */ 674 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 675 !spa_writeable(dmu_objset_spa(os))) { 676 readonly = B_TRUE; 677 do_readonly = B_TRUE; 678 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 679 readonly = B_FALSE; 680 do_readonly = B_TRUE; 681 } 682 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 683 setuid = B_FALSE; 684 do_setuid = B_TRUE; 685 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 686 setuid = B_TRUE; 687 do_setuid = B_TRUE; 688 } 689 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 690 exec = B_FALSE; 691 do_exec = B_TRUE; 692 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 693 exec = B_TRUE; 694 do_exec = B_TRUE; 695 } 696 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 697 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; 698 do_xattr = B_TRUE; 699 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 700 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 701 do_xattr = B_TRUE; 702 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { 703 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; 704 do_xattr = B_TRUE; 705 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { 706 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; 707 do_xattr = B_TRUE; 708 } 709 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 710 atime = B_FALSE; 711 do_atime = B_TRUE; 712 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 713 atime = B_TRUE; 714 do_atime = B_TRUE; 715 } 716 717 /* 718 * We need to enter pool configuration here, so that we can use 719 * 
dsl_prop_get_int_ds() to handle the special nbmand property below. 720 * dsl_prop_get_integer() can not be used, because it has to acquire 721 * spa_namespace_lock and we can not do that because we already hold 722 * z_teardown_lock. The problem is that spa_write_cachefile() is called 723 * with spa_namespace_lock held and the function calls ZFS vnode 724 * operations to write the cache file and thus z_teardown_lock is 725 * acquired after spa_namespace_lock. 726 */ 727 ds = dmu_objset_ds(os); 728 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 729 730 /* 731 * nbmand is a special property. It can only be changed at 732 * mount time. 733 * 734 * This is weird, but it is documented to only be changeable 735 * at mount time. 736 */ 737 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 738 nbmand = B_FALSE; 739 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 740 nbmand = B_TRUE; 741 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) { 742 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 743 return (error); 744 } 745 746 /* 747 * Register property callbacks. 748 * 749 * It would probably be fine to just check for i/o error from 750 * the first prop_register(), but I guess I like to go 751 * overboard... 752 */ 753 error = dsl_prop_register(ds, 754 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 755 error = error ? error : dsl_prop_register(ds, 756 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 757 error = error ? error : dsl_prop_register(ds, 758 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 759 error = error ? error : dsl_prop_register(ds, 760 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 761 error = error ? error : dsl_prop_register(ds, 762 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 763 error = error ? error : dsl_prop_register(ds, 764 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 765 error = error ? 
error : dsl_prop_register(ds, 766 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 767 error = error ? error : dsl_prop_register(ds, 768 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); 769 error = error ? error : dsl_prop_register(ds, 770 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 771 error = error ? error : dsl_prop_register(ds, 772 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 773 zfsvfs); 774 error = error ? error : dsl_prop_register(ds, 775 zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs); 776 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 777 if (error) 778 goto unregister; 779 780 /* 781 * Invoke our callbacks to restore temporary mount options. 782 */ 783 if (do_readonly) 784 readonly_changed_cb(zfsvfs, readonly); 785 if (do_setuid) 786 setuid_changed_cb(zfsvfs, setuid); 787 if (do_exec) 788 exec_changed_cb(zfsvfs, exec); 789 if (do_xattr) 790 xattr_changed_cb(zfsvfs, xattr); 791 if (do_atime) 792 atime_changed_cb(zfsvfs, atime); 793 794 nbmand_changed_cb(zfsvfs, nbmand); 795 796 return (0); 797 798 unregister: 799 dsl_prop_unregister_all(ds, zfsvfs); 800 return (error); 801 } 802 803 /* 804 * Associate this zfsvfs with the given objset, which must be owned. 805 * This will cache a bunch of on-disk state from the objset in the 806 * zfsvfs. 807 */ 808 static int 809 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 810 { 811 int error; 812 uint64_t val; 813 814 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 815 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 816 zfsvfs->z_os = os; 817 818 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 819 if (error != 0) 820 return (error); 821 if (zfsvfs->z_version > 822 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 823 (void) printf("Can't mount a version %lld file system " 824 "on a version %lld pool\n. 
Pool must be upgraded to mount " 825 "this file system.", (u_longlong_t)zfsvfs->z_version, 826 (u_longlong_t)spa_version(dmu_objset_spa(os))); 827 return (SET_ERROR(ENOTSUP)); 828 } 829 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 830 if (error != 0) 831 return (error); 832 zfsvfs->z_norm = (int)val; 833 834 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 835 if (error != 0) 836 return (error); 837 zfsvfs->z_utf8 = (val != 0); 838 839 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 840 if (error != 0) 841 return (error); 842 zfsvfs->z_case = (uint_t)val; 843 844 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); 845 if (error != 0) 846 return (error); 847 zfsvfs->z_acl_type = (uint_t)val; 848 849 /* 850 * Fold case on file systems that are always or sometimes case 851 * insensitive. 852 */ 853 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 854 zfsvfs->z_case == ZFS_CASE_MIXED) 855 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 856 857 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 858 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 859 860 uint64_t sa_obj = 0; 861 if (zfsvfs->z_use_sa) { 862 /* should either have both of these objects or none */ 863 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 864 &sa_obj); 865 if (error != 0) 866 return (error); 867 868 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); 869 if (error == 0 && val == ZFS_XATTR_SA) 870 zfsvfs->z_xattr_sa = B_TRUE; 871 } 872 873 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA, 874 &zfsvfs->z_defaultuserquota); 875 if (error != 0) 876 return (error); 877 878 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA, 879 &zfsvfs->z_defaultgroupquota); 880 if (error != 0) 881 return (error); 882 883 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA, 884 &zfsvfs->z_defaultprojectquota); 885 if (error != 0) 886 return (error); 887 888 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA, 889 &zfsvfs->z_defaultuserobjquota); 890 if 
(error != 0) 891 return (error); 892 893 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA, 894 &zfsvfs->z_defaultgroupobjquota); 895 if (error != 0) 896 return (error); 897 898 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA, 899 &zfsvfs->z_defaultprojectobjquota); 900 if (error != 0) 901 return (error); 902 903 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 904 &zfsvfs->z_attr_table); 905 if (error != 0) 906 return (error); 907 908 if (zfsvfs->z_version >= ZPL_VERSION_SA) 909 sa_register_update_callback(os, zfs_sa_upgrade); 910 911 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 912 &zfsvfs->z_root); 913 if (error != 0) 914 return (error); 915 ASSERT3U(zfsvfs->z_root, !=, 0); 916 917 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 918 &zfsvfs->z_unlinkedobj); 919 if (error != 0) 920 return (error); 921 922 error = zap_lookup(os, MASTER_NODE_OBJ, 923 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 924 8, 1, &zfsvfs->z_userquota_obj); 925 if (error == ENOENT) 926 zfsvfs->z_userquota_obj = 0; 927 else if (error != 0) 928 return (error); 929 930 error = zap_lookup(os, MASTER_NODE_OBJ, 931 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 932 8, 1, &zfsvfs->z_groupquota_obj); 933 if (error == ENOENT) 934 zfsvfs->z_groupquota_obj = 0; 935 else if (error != 0) 936 return (error); 937 938 error = zap_lookup(os, MASTER_NODE_OBJ, 939 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 940 8, 1, &zfsvfs->z_projectquota_obj); 941 if (error == ENOENT) 942 zfsvfs->z_projectquota_obj = 0; 943 else if (error != 0) 944 return (error); 945 946 error = zap_lookup(os, MASTER_NODE_OBJ, 947 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 948 8, 1, &zfsvfs->z_userobjquota_obj); 949 if (error == ENOENT) 950 zfsvfs->z_userobjquota_obj = 0; 951 else if (error != 0) 952 return (error); 953 954 error = zap_lookup(os, MASTER_NODE_OBJ, 955 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 956 8, 1, &zfsvfs->z_groupobjquota_obj); 
957 if (error == ENOENT) 958 zfsvfs->z_groupobjquota_obj = 0; 959 else if (error != 0) 960 return (error); 961 962 error = zap_lookup(os, MASTER_NODE_OBJ, 963 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 964 8, 1, &zfsvfs->z_projectobjquota_obj); 965 if (error == ENOENT) 966 zfsvfs->z_projectobjquota_obj = 0; 967 else if (error != 0) 968 return (error); 969 970 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 971 &zfsvfs->z_fuid_obj); 972 if (error == ENOENT) 973 zfsvfs->z_fuid_obj = 0; 974 else if (error != 0) 975 return (error); 976 977 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 978 &zfsvfs->z_shares_dir); 979 if (error == ENOENT) 980 zfsvfs->z_shares_dir = 0; 981 else if (error != 0) 982 return (error); 983 984 /* 985 * Only use the name cache if we are looking for a 986 * name on a file system that does not require normalization 987 * or case folding. We can also look there if we happen to be 988 * on a non-normalizing, mixed sensitivity file system IF we 989 * are looking for the exact name (which is always the case on 990 * FreeBSD). 991 */ 992 zfsvfs->z_use_namecache = !zfsvfs->z_norm || 993 ((zfsvfs->z_case == ZFS_CASE_MIXED) && 994 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); 995 996 return (0); 997 } 998 999 taskq_t *zfsvfs_taskq; 1000 1001 static void 1002 zfsvfs_task_unlinked_drain(void *context, int pending __unused) 1003 { 1004 1005 zfs_unlinked_drain((zfsvfs_t *)context); 1006 } 1007 1008 int 1009 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 1010 { 1011 objset_t *os; 1012 zfsvfs_t *zfsvfs; 1013 int error; 1014 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 1015 1016 /* 1017 * XXX: Fix struct statfs so this isn't necessary! 
1018 * 1019 * The 'osname' is used as the filesystem's special node, which means 1020 * it must fit in statfs.f_mntfromname, or else it can't be 1021 * enumerated, so libzfs_mnttab_find() returns NULL, which causes 1022 * 'zfs unmount' to think it's not mounted when it is. 1023 */ 1024 if (strlen(osname) >= MNAMELEN) 1025 return (SET_ERROR(ENAMETOOLONG)); 1026 1027 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1028 1029 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, 1030 &os); 1031 if (error != 0) { 1032 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1033 return (error); 1034 } 1035 1036 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 1037 1038 return (error); 1039 } 1040 1041 1042 int 1043 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 1044 { 1045 int error; 1046 1047 zfsvfs->z_vfs = NULL; 1048 zfsvfs->z_parent = zfsvfs; 1049 1050 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1051 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1052 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1053 offsetof(znode_t, z_link_node)); 1054 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, 1055 zfsvfs_task_unlinked_drain, zfsvfs); 1056 ZFS_TEARDOWN_INIT(zfsvfs); 1057 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); 1058 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1059 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1060 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1061 1062 error = zfsvfs_init(zfsvfs, os); 1063 if (error != 0) { 1064 dmu_objset_disown(os, B_TRUE, zfsvfs); 1065 *zfvp = NULL; 1066 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1067 return (error); 1068 } 1069 1070 *zfvp = zfsvfs; 1071 return (0); 1072 } 1073 1074 static int 1075 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1076 { 1077 int error; 1078 1079 /* 1080 * Check for a bad on-disk format version now since we 1081 * lied about owning the dataset readonly before. 
1082 */ 1083 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && 1084 dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) 1085 return (SET_ERROR(EROFS)); 1086 1087 error = zfs_register_callbacks(zfsvfs->z_vfs); 1088 if (error) 1089 return (error); 1090 1091 /* 1092 * If we are not mounting (ie: online recv), then we don't 1093 * have to worry about replaying the log as we blocked all 1094 * operations out since we closed the ZIL. 1095 */ 1096 if (mounting) { 1097 boolean_t readonly; 1098 1099 ASSERT0P(zfsvfs->z_kstat.dk_kstats); 1100 error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); 1101 if (error) 1102 return (error); 1103 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, 1104 &zfsvfs->z_kstat.dk_zil_sums); 1105 1106 /* 1107 * During replay we remove the read only flag to 1108 * allow replays to succeed. 1109 */ 1110 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1111 if (readonly != 0) { 1112 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1113 } else { 1114 dsl_dir_t *dd; 1115 zap_stats_t zs; 1116 1117 if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, 1118 &zs) == 0) { 1119 dataset_kstats_update_nunlinks_kstat( 1120 &zfsvfs->z_kstat, zs.zs_num_entries); 1121 dprintf_ds(zfsvfs->z_os->os_dsl_dataset, 1122 "num_entries in unlinked set: %llu", 1123 (u_longlong_t)zs.zs_num_entries); 1124 } 1125 1126 zfs_unlinked_drain(zfsvfs); 1127 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1128 dd->dd_activity_cancelled = B_FALSE; 1129 } 1130 1131 /* 1132 * Parse and replay the intent log. 1133 * 1134 * Because of ziltest, this must be done after 1135 * zfs_unlinked_drain(). (Further note: ziltest 1136 * doesn't use readonly mounts, where 1137 * zfs_unlinked_drain() isn't called.) This is because 1138 * ziltest causes spa_sync() to think it's committed, 1139 * but actually it is not, so the intent log contains 1140 * many txg's worth of changes. 
1141 * 1142 * In particular, if object N is in the unlinked set in 1143 * the last txg to actually sync, then it could be 1144 * actually freed in a later txg and then reallocated 1145 * in a yet later txg. This would write a "create 1146 * object N" record to the intent log. Normally, this 1147 * would be fine because the spa_sync() would have 1148 * written out the fact that object N is free, before 1149 * we could write the "create object N" intent log 1150 * record. 1151 * 1152 * But when we are in ziltest mode, we advance the "open 1153 * txg" without actually spa_sync()-ing the changes to 1154 * disk. So we would see that object N is still 1155 * allocated and in the unlinked set, and there is an 1156 * intent log record saying to allocate it. 1157 */ 1158 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1159 if (zil_replay_disable) { 1160 zil_destroy(zfsvfs->z_log, B_FALSE); 1161 } else { 1162 boolean_t use_nc = zfsvfs->z_use_namecache; 1163 zfsvfs->z_use_namecache = B_FALSE; 1164 zfsvfs->z_replay = B_TRUE; 1165 zil_replay(zfsvfs->z_os, zfsvfs, 1166 zfs_replay_vector); 1167 zfsvfs->z_replay = B_FALSE; 1168 zfsvfs->z_use_namecache = use_nc; 1169 } 1170 } 1171 1172 /* restore readonly bit */ 1173 if (readonly != 0) 1174 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1175 } else { 1176 ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL); 1177 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data, 1178 &zfsvfs->z_kstat.dk_zil_sums); 1179 } 1180 1181 /* 1182 * Set the objset user_ptr to track its zfsvfs. 
1183 */ 1184 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1185 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1186 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1187 1188 return (0); 1189 } 1190 1191 void 1192 zfsvfs_free(zfsvfs_t *zfsvfs) 1193 { 1194 int i; 1195 1196 zfs_fuid_destroy(zfsvfs); 1197 1198 mutex_destroy(&zfsvfs->z_znodes_lock); 1199 mutex_destroy(&zfsvfs->z_lock); 1200 list_destroy(&zfsvfs->z_all_znodes); 1201 ZFS_TEARDOWN_DESTROY(zfsvfs); 1202 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); 1203 rw_destroy(&zfsvfs->z_fuid_lock); 1204 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1205 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1206 dataset_kstats_destroy(&zfsvfs->z_kstat); 1207 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1208 } 1209 1210 static void 1211 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1212 { 1213 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1214 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1215 } 1216 1217 extern int zfs_xattr_compat; 1218 1219 static int 1220 zfs_domount(vfs_t *vfsp, char *osname) 1221 { 1222 uint64_t recordsize, fsid_guid; 1223 int error = 0; 1224 zfsvfs_t *zfsvfs; 1225 1226 ASSERT3P(vfsp, !=, NULL); 1227 ASSERT3P(osname, !=, NULL); 1228 1229 error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); 1230 if (error) 1231 return (error); 1232 zfsvfs->z_vfs = vfsp; 1233 1234 if ((error = dsl_prop_get_integer(osname, 1235 "recordsize", &recordsize, NULL))) 1236 goto out; 1237 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; 1238 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; 1239 1240 vfsp->vfs_data = zfsvfs; 1241 vfsp->mnt_flag |= MNT_LOCAL; 1242 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; 1243 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; 1244 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; 1245 /* 1246 * This can cause a loss of coherence between ARC and page cache 1247 * on ZoF - unclear if the problem is in FreeBSD or ZoF 1248 */ 1249 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ 1250 
vfsp->mnt_kern_flag |= MNTK_NOMSYNC; 1251 vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; 1252 1253 #if defined(_KERNEL) && !defined(KMEM_DEBUG) 1254 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; 1255 #endif 1256 /* 1257 * The fsid is 64 bits, composed of an 8-bit fs type, which 1258 * separates our fsid from any other filesystem types, and a 1259 * 56-bit objset unique ID. The objset unique ID is unique to 1260 * all objsets open on this system, provided by unique_create(). 1261 * The 8-bit fs type must be put in the low bits of fsid[1] 1262 * because that's where other Solaris filesystems put it. 1263 */ 1264 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1265 ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0); 1266 vfsp->vfs_fsid.val[0] = fsid_guid; 1267 vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) | 1268 (vfsp->mnt_vfc->vfc_typenum & 0xFF); 1269 1270 /* 1271 * Set features for file system. 1272 */ 1273 zfs_set_fuid_feature(zfsvfs); 1274 1275 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1276 uint64_t pval; 1277 1278 atime_changed_cb(zfsvfs, B_FALSE); 1279 readonly_changed_cb(zfsvfs, B_TRUE); 1280 if ((error = dsl_prop_get_integer(osname, 1281 "xattr", &pval, NULL))) 1282 goto out; 1283 xattr_changed_cb(zfsvfs, pval); 1284 if ((error = dsl_prop_get_integer(osname, 1285 "acltype", &pval, NULL))) 1286 goto out; 1287 acl_type_changed_cb(zfsvfs, pval); 1288 zfsvfs->z_issnap = B_TRUE; 1289 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1290 1291 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1292 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1293 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1294 } else { 1295 if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) 1296 goto out; 1297 } 1298 1299 #if __FreeBSD_version >= 1500040 1300 /* 1301 * Named attributes can only work if the xattr property is set to 1302 * on/dir and not sa. Also, zfs_xattr_compat must be set. 
1303 */ 1304 if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa && 1305 zfs_xattr_compat) 1306 vfsp->mnt_flag |= MNT_NAMEDATTR; 1307 #endif 1308 1309 vfs_mountedfrom(vfsp, osname); 1310 1311 if (!zfsvfs->z_issnap) 1312 zfsctl_create(zfsvfs); 1313 out: 1314 if (error) { 1315 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1316 zfsvfs_free(zfsvfs); 1317 } else { 1318 atomic_inc_32(&zfs_active_fs_count); 1319 } 1320 1321 return (error); 1322 } 1323 1324 static void 1325 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1326 { 1327 objset_t *os = zfsvfs->z_os; 1328 1329 if (!dmu_objset_is_snapshot(os)) 1330 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1331 } 1332 1333 static int 1334 getpoolname(const char *osname, char *poolname) 1335 { 1336 char *p; 1337 1338 p = strchr(osname, '/'); 1339 if (p == NULL) { 1340 if (strlen(osname) >= MAXNAMELEN) 1341 return (ENAMETOOLONG); 1342 (void) strcpy(poolname, osname); 1343 } else { 1344 if (p - osname >= MAXNAMELEN) 1345 return (ENAMETOOLONG); 1346 (void) strlcpy(poolname, osname, p - osname + 1); 1347 } 1348 return (0); 1349 } 1350 1351 static void 1352 fetch_osname_options(char *name, bool *checkpointrewind) 1353 { 1354 1355 if (name[0] == '!') { 1356 *checkpointrewind = true; 1357 memmove(name, name + 1, strlen(name)); 1358 } else { 1359 *checkpointrewind = false; 1360 } 1361 } 1362 1363 static int 1364 zfs_mount(vfs_t *vfsp) 1365 { 1366 kthread_t *td = curthread; 1367 vnode_t *mvp = vfsp->mnt_vnodecovered; 1368 cred_t *cr = td->td_ucred; 1369 char *osname; 1370 int error = 0; 1371 int canwrite; 1372 bool checkpointrewind, isctlsnap = false; 1373 1374 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) 1375 return (SET_ERROR(EINVAL)); 1376 1377 /* 1378 * If full-owner-access is enabled and delegated administration is 1379 * turned on, we must set nosuid. 
1380 */ 1381 if (zfs_super_owner && 1382 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { 1383 secpolicy_fs_mount_clearopts(cr, vfsp); 1384 } 1385 1386 fetch_osname_options(osname, &checkpointrewind); 1387 isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) && 1388 strchr(osname, '@') != NULL); 1389 1390 /* 1391 * Check for mount privilege? 1392 * 1393 * If we don't have privilege then see if 1394 * we have local permission to allow it 1395 */ 1396 error = secpolicy_fs_mount(cr, mvp, vfsp); 1397 if (error && isctlsnap) { 1398 secpolicy_fs_mount_clearopts(cr, vfsp); 1399 } else if (error) { 1400 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) 1401 goto out; 1402 1403 if (!(vfsp->vfs_flag & MS_REMOUNT)) { 1404 vattr_t vattr; 1405 1406 /* 1407 * Make sure user is the owner of the mount point 1408 * or has sufficient privileges. 1409 */ 1410 1411 vattr.va_mask = AT_UID; 1412 1413 vn_lock(mvp, LK_SHARED | LK_RETRY); 1414 if (VOP_GETATTR(mvp, &vattr, cr)) { 1415 VOP_UNLOCK(mvp); 1416 goto out; 1417 } 1418 1419 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && 1420 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { 1421 VOP_UNLOCK(mvp); 1422 goto out; 1423 } 1424 VOP_UNLOCK(mvp); 1425 } 1426 1427 secpolicy_fs_mount_clearopts(cr, vfsp); 1428 } 1429 1430 /* 1431 * Refuse to mount a filesystem if we are in a local zone and the 1432 * dataset is not visible. 1433 */ 1434 if (!INGLOBALZONE(curproc) && 1435 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1436 boolean_t mount_snapshot = B_FALSE; 1437 1438 /* 1439 * Snapshots may be mounted in .zfs for unjailed datasets 1440 * if allowed by the jail param zfs.mount_snapshot. 
1441 */ 1442 if (isctlsnap) { 1443 struct prison *pr; 1444 struct zfs_jailparam *zjp; 1445 1446 pr = curthread->td_ucred->cr_prison; 1447 mtx_lock(&pr->pr_mtx); 1448 zjp = osd_jail_get(pr, zfs_jailparam_slot); 1449 mtx_unlock(&pr->pr_mtx); 1450 if (zjp && zjp->mount_snapshot) 1451 mount_snapshot = B_TRUE; 1452 } 1453 if (!mount_snapshot) { 1454 error = SET_ERROR(EPERM); 1455 goto out; 1456 } 1457 } 1458 1459 vfsp->vfs_flag |= MNT_NFS4ACLS; 1460 1461 /* 1462 * When doing a remount, we simply refresh our temporary properties 1463 * according to those options set in the current VFS options. 1464 */ 1465 if (vfsp->vfs_flag & MS_REMOUNT) { 1466 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1467 1468 /* 1469 * Refresh mount options with z_teardown_lock blocking I/O while 1470 * the filesystem is in an inconsistent state. 1471 * The lock also serializes this code with filesystem 1472 * manipulations between entry to zfs_suspend_fs() and return 1473 * from zfs_resume_fs(). 1474 */ 1475 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1476 zfs_unregister_callbacks(zfsvfs); 1477 error = zfs_register_callbacks(vfsp); 1478 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1479 goto out; 1480 } 1481 1482 /* Initial root mount: try hard to import the requested root pool. 
*/ 1483 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && 1484 (vfsp->vfs_flag & MNT_UPDATE) == 0) { 1485 char pname[MAXNAMELEN]; 1486 1487 error = getpoolname(osname, pname); 1488 if (error == 0) 1489 error = spa_import_rootpool(pname, checkpointrewind); 1490 if (error) 1491 goto out; 1492 } 1493 DROP_GIANT(); 1494 error = zfs_domount(vfsp, osname); 1495 PICKUP_GIANT(); 1496 1497 out: 1498 return (error); 1499 } 1500 1501 static int 1502 zfs_statfs(vfs_t *vfsp, struct statfs *statp) 1503 { 1504 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1505 uint64_t refdbytes, availbytes, usedobjs, availobjs; 1506 int error; 1507 1508 statp->f_version = STATFS_VERSION; 1509 1510 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1511 return (error); 1512 1513 dmu_objset_space(zfsvfs->z_os, 1514 &refdbytes, &availbytes, &usedobjs, &availobjs); 1515 1516 /* 1517 * The underlying storage pool actually uses multiple block sizes. 1518 * We report the fragsize as the smallest block size we support, 1519 * and we report our blocksize as the filesystem's maximum blocksize. 1520 */ 1521 statp->f_bsize = SPA_MINBLOCKSIZE; 1522 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; 1523 1524 /* 1525 * The following report "total" blocks of various kinds in the 1526 * file system, but reported in terms of f_frsize - the 1527 * "fragment" size. 1528 */ 1529 1530 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 1531 statp->f_bfree = availbytes / statp->f_bsize; 1532 statp->f_bavail = statp->f_bfree; /* no root reservation */ 1533 1534 /* 1535 * statvfs() should really be called statufs(), because it assumes 1536 * static metadata. ZFS doesn't preallocate files, so the best 1537 * we can do is report the max that could possibly fit in f_files, 1538 * and that minus the number actually used in f_ffree. 1539 * For f_ffree, report the smaller of the number of object available 1540 * and the number of blocks (each object will take at least a block). 
1541 */ 1542 statp->f_ffree = MIN(availobjs, statp->f_bfree); 1543 statp->f_files = statp->f_ffree + usedobjs; 1544 1545 /* 1546 * We're a zfs filesystem. 1547 */ 1548 strlcpy(statp->f_fstypename, "zfs", 1549 sizeof (statp->f_fstypename)); 1550 1551 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, 1552 sizeof (statp->f_mntfromname)); 1553 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, 1554 sizeof (statp->f_mntonname)); 1555 1556 statp->f_namemax = 1557 zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1); 1558 1559 zfs_exit(zfsvfs, FTAG); 1560 return (0); 1561 } 1562 1563 static int 1564 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) 1565 { 1566 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1567 znode_t *rootzp; 1568 int error; 1569 1570 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1571 return (error); 1572 1573 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); 1574 if (error == 0) 1575 *vpp = ZTOV(rootzp); 1576 1577 zfs_exit(zfsvfs, FTAG); 1578 1579 if (error == 0) { 1580 error = vn_lock(*vpp, flags); 1581 if (error != 0) { 1582 VN_RELE(*vpp); 1583 *vpp = NULL; 1584 } 1585 } 1586 return (error); 1587 } 1588 1589 /* 1590 * Teardown the zfsvfs::z_os. 1591 * 1592 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 1593 * and 'z_teardown_inactive_lock' held. 1594 */ 1595 static int 1596 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 1597 { 1598 znode_t *zp; 1599 dsl_dir_t *dd; 1600 1601 /* 1602 * If someone has not already unmounted this file system, 1603 * drain the zrele_taskq to ensure all active references to the 1604 * zfsvfs_t have been handled only then can it be safely destroyed. 1605 */ 1606 if (zfsvfs->z_os) { 1607 /* 1608 * If we're unmounting we have to wait for the list to 1609 * drain completely. 
1610 * 1611 * If we're not unmounting there's no guarantee the list 1612 * will drain completely, but zreles run from the taskq 1613 * may add the parents of dir-based xattrs to the taskq 1614 * so we want to wait for these. 1615 * 1616 * We can safely check z_all_znodes for being empty because the 1617 * VFS has already blocked operations which add to it. 1618 */ 1619 int round = 0; 1620 while (!list_is_empty(&zfsvfs->z_all_znodes)) { 1621 taskq_wait_outstanding(dsl_pool_zrele_taskq( 1622 dmu_objset_pool(zfsvfs->z_os)), 0); 1623 if (++round > 1 && !unmounting) 1624 break; 1625 } 1626 } 1627 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1628 1629 if (!unmounting) { 1630 /* 1631 * We purge the parent filesystem's vfsp as the parent 1632 * filesystem and all of its snapshots have their vnode's 1633 * v_vfsp set to the parent's filesystem's vfsp. Note, 1634 * 'z_parent' is self referential for non-snapshots. 1635 */ 1636 #ifdef FREEBSD_NAMECACHE 1637 cache_purgevfs(zfsvfs->z_parent->z_vfs); 1638 #endif 1639 } 1640 1641 /* 1642 * Close the zil. NB: Can't close the zil while zfs_inactive 1643 * threads are blocked as zil_close can call zfs_inactive. 1644 */ 1645 if (zfsvfs->z_log) { 1646 zil_close(zfsvfs->z_log); 1647 zfsvfs->z_log = NULL; 1648 } 1649 1650 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); 1651 1652 /* 1653 * If we are not unmounting (ie: online recv) and someone already 1654 * unmounted this file system while we were doing the switcheroo, 1655 * or a reopen of z_os failed then just bail out now. 1656 */ 1657 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 1658 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1659 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1660 return (SET_ERROR(EIO)); 1661 } 1662 1663 /* 1664 * At this point there are no vops active, and any new vops will 1665 * fail with EIO since we have z_teardown_lock for writer (only 1666 * relevant for forced unmount). 1667 * 1668 * Release all holds on dbufs. 
1669 */ 1670 mutex_enter(&zfsvfs->z_znodes_lock); 1671 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 1672 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 1673 if (zp->z_sa_hdl != NULL) { 1674 zfs_znode_dmu_fini(zp); 1675 } 1676 } 1677 mutex_exit(&zfsvfs->z_znodes_lock); 1678 1679 /* 1680 * If we are unmounting, set the unmounted flag and let new vops 1681 * unblock. zfs_inactive will have the unmounted behavior, and all 1682 * other vops will fail with EIO. 1683 */ 1684 if (unmounting) { 1685 zfsvfs->z_unmounted = B_TRUE; 1686 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 1687 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1688 } 1689 1690 /* 1691 * z_os will be NULL if there was an error in attempting to reopen 1692 * zfsvfs, so just return as the properties had already been 1693 * unregistered and cached data had been evicted before. 1694 */ 1695 if (zfsvfs->z_os == NULL) 1696 return (0); 1697 1698 /* 1699 * Unregister properties. 1700 */ 1701 zfs_unregister_callbacks(zfsvfs); 1702 1703 /* 1704 * Evict cached data. We must write out any dirty data before 1705 * disowning the dataset. 
1706 */ 1707 objset_t *os = zfsvfs->z_os; 1708 boolean_t os_dirty = B_FALSE; 1709 for (int t = 0; t < TXG_SIZE; t++) { 1710 if (dmu_objset_is_dirty(os, t)) { 1711 os_dirty = B_TRUE; 1712 break; 1713 } 1714 } 1715 if (!zfs_is_readonly(zfsvfs) && os_dirty) 1716 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 1717 dmu_objset_evict_dbufs(zfsvfs->z_os); 1718 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; 1719 dsl_dir_cancel_waiters(dd); 1720 1721 return (0); 1722 } 1723 1724 static int 1725 zfs_umount(vfs_t *vfsp, int fflag) 1726 { 1727 kthread_t *td = curthread; 1728 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1729 objset_t *os; 1730 cred_t *cr = td->td_ucred; 1731 int ret; 1732 1733 ret = secpolicy_fs_unmount(cr, vfsp); 1734 if (ret) { 1735 if (dsl_deleg_access((char *)vfsp->vfs_resource, 1736 ZFS_DELEG_PERM_MOUNT, cr)) 1737 return (ret); 1738 } 1739 1740 /* 1741 * Unmount any snapshots mounted under .zfs before unmounting the 1742 * dataset itself. 1743 */ 1744 if (zfsvfs->z_ctldir != NULL) { 1745 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) 1746 return (ret); 1747 } 1748 1749 if (fflag & MS_FORCE) { 1750 /* 1751 * Mark file system as unmounted before calling 1752 * vflush(FORCECLOSE). This way we ensure no future vnops 1753 * will be called and risk operating on DOOMED vnodes. 1754 */ 1755 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); 1756 zfsvfs->z_unmounted = B_TRUE; 1757 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 1758 } 1759 1760 /* 1761 * Flush all the files. 1762 */ 1763 ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); 1764 if (ret != 0) 1765 return (ret); 1766 while (taskqueue_cancel(zfsvfs_taskq->tq_queue, 1767 &zfsvfs->z_unlinked_drain_task, NULL) != 0) 1768 taskqueue_drain(zfsvfs_taskq->tq_queue, 1769 &zfsvfs->z_unlinked_drain_task); 1770 1771 VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); 1772 os = zfsvfs->z_os; 1773 1774 /* 1775 * z_os will be NULL if there was an error in 1776 * attempting to reopen zfsvfs. 
1777 */ 1778 if (os != NULL) { 1779 /* 1780 * Unset the objset user_ptr. 1781 */ 1782 mutex_enter(&os->os_user_ptr_lock); 1783 dmu_objset_set_user(os, NULL); 1784 mutex_exit(&os->os_user_ptr_lock); 1785 1786 /* 1787 * Finally release the objset 1788 */ 1789 dmu_objset_disown(os, B_TRUE, zfsvfs); 1790 } 1791 1792 /* 1793 * We can now safely destroy the '.zfs' directory node. 1794 */ 1795 if (zfsvfs->z_ctldir != NULL) 1796 zfsctl_destroy(zfsvfs); 1797 zfs_freevfs(vfsp); 1798 1799 return (0); 1800 } 1801 1802 static int 1803 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) 1804 { 1805 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1806 znode_t *zp; 1807 int err; 1808 1809 /* 1810 * zfs_zget() can't operate on virtual entries like .zfs/ or 1811 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. 1812 * This will make NFS to switch to LOOKUP instead of using VGET. 1813 */ 1814 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || 1815 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) 1816 return (EOPNOTSUPP); 1817 1818 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1819 return (err); 1820 err = zfs_zget(zfsvfs, ino, &zp); 1821 if (err == 0 && zp->z_unlinked) { 1822 vrele(ZTOV(zp)); 1823 err = EINVAL; 1824 } 1825 if (err == 0) 1826 *vpp = ZTOV(zp); 1827 zfs_exit(zfsvfs, FTAG); 1828 if (err == 0) { 1829 err = vn_lock(*vpp, flags); 1830 if (err != 0) 1831 vrele(*vpp); 1832 #if __FreeBSD_version >= 1500040 1833 else if ((zp->z_pflags & ZFS_XATTR) != 0) { 1834 if ((*vpp)->v_type == VDIR) 1835 vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); 1836 else 1837 vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR); 1838 } 1839 #endif 1840 } 1841 if (err != 0) 1842 *vpp = NULL; 1843 return (err); 1844 } 1845 1846 static int 1847 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, 1848 struct ucred **credanonp, int *numsecflavors, int *secflavors) 1849 { 1850 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1851 1852 /* 1853 * If this is regular file system vfsp is the same as 1854 
* zfsvfs->z_parent->z_vfs, but if it is snapshot, 1855 * zfsvfs->z_parent->z_vfs represents parent file system 1856 * which we have to use here, because only this file system 1857 * has mnt_export configured. 1858 */ 1859 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, 1860 credanonp, numsecflavors, secflavors)); 1861 } 1862 1863 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN, 1864 "struct fid bigger than SHORT_FID_LEN"); 1865 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN, 1866 "struct fid bigger than LONG_FID_LEN"); 1867 1868 static int 1869 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) 1870 { 1871 struct componentname cn; 1872 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1873 znode_t *zp; 1874 vnode_t *dvp; 1875 uint64_t object = 0; 1876 uint64_t fid_gen = 0; 1877 uint64_t setgen = 0; 1878 uint64_t gen_mask; 1879 uint64_t zp_gen; 1880 int i, err; 1881 1882 *vpp = NULL; 1883 1884 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1885 return (err); 1886 1887 /* 1888 * On FreeBSD we can get snapshot's mount point or its parent file 1889 * system mount point depending if snapshot is already mounted or not. 
1890 */ 1891 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { 1892 zfid_long_t *zlfid = (zfid_long_t *)fidp; 1893 uint64_t objsetid = 0; 1894 1895 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 1896 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 1897 1898 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 1899 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 1900 1901 zfs_exit(zfsvfs, FTAG); 1902 1903 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 1904 if (err) 1905 return (SET_ERROR(EINVAL)); 1906 if ((err = zfs_enter(zfsvfs, FTAG)) != 0) 1907 return (err); 1908 } 1909 1910 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 1911 zfid_short_t *zfid = (zfid_short_t *)fidp; 1912 1913 for (i = 0; i < sizeof (zfid->zf_object); i++) 1914 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 1915 1916 for (i = 0; i < sizeof (zfid->zf_gen); i++) 1917 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 1918 } else { 1919 zfs_exit(zfsvfs, FTAG); 1920 return (SET_ERROR(EINVAL)); 1921 } 1922 1923 if (fidp->fid_len == LONG_FID_LEN && setgen != 0) { 1924 zfs_exit(zfsvfs, FTAG); 1925 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", 1926 (u_longlong_t)fid_gen, (u_longlong_t)setgen); 1927 return (SET_ERROR(EINVAL)); 1928 } 1929 1930 /* 1931 * A zero fid_gen means we are in .zfs or the .zfs/snapshot 1932 * directory tree. If the object == zfsvfs->z_shares_dir, then 1933 * we are in the .zfs/shares directory tree. 
1934 */ 1935 if ((fid_gen == 0 && 1936 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || 1937 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { 1938 zfs_exit(zfsvfs, FTAG); 1939 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); 1940 if (object == ZFSCTL_INO_SNAPDIR) { 1941 cn.cn_nameptr = "snapshot"; 1942 cn.cn_namelen = strlen(cn.cn_nameptr); 1943 cn.cn_nameiop = LOOKUP; 1944 cn.cn_flags = ISLASTCN | LOCKLEAF; 1945 cn.cn_lkflags = flags; 1946 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1947 vput(dvp); 1948 } else if (object == zfsvfs->z_shares_dir) { 1949 /* 1950 * XXX This branch must not be taken, 1951 * if it is, then the lookup below will 1952 * explode. 1953 */ 1954 cn.cn_nameptr = "shares"; 1955 cn.cn_namelen = strlen(cn.cn_nameptr); 1956 cn.cn_nameiop = LOOKUP; 1957 cn.cn_flags = ISLASTCN; 1958 cn.cn_lkflags = flags; 1959 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); 1960 vput(dvp); 1961 } else { 1962 *vpp = dvp; 1963 } 1964 return (err); 1965 } 1966 1967 gen_mask = -1ULL >> (64 - 8 * i); 1968 1969 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object, 1970 (u_longlong_t)fid_gen, 1971 (u_longlong_t)gen_mask); 1972 if ((err = zfs_zget(zfsvfs, object, &zp))) { 1973 zfs_exit(zfsvfs, FTAG); 1974 return (err); 1975 } 1976 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1977 sizeof (uint64_t)); 1978 zp_gen = zp_gen & gen_mask; 1979 if (zp_gen == 0) 1980 zp_gen = 1; 1981 if (zp->z_unlinked || zp_gen != fid_gen) { 1982 dprintf("znode gen (%llu) != fid gen (%llu)\n", 1983 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen); 1984 vrele(ZTOV(zp)); 1985 zfs_exit(zfsvfs, FTAG); 1986 return (SET_ERROR(EINVAL)); 1987 } 1988 1989 *vpp = ZTOV(zp); 1990 zfs_exit(zfsvfs, FTAG); 1991 err = vn_lock(*vpp, flags); 1992 if (err == 0) { 1993 vnode_create_vobject(*vpp, zp->z_size, curthread); 1994 #if __FreeBSD_version >= 1500040 1995 if ((zp->z_pflags & ZFS_XATTR) != 0) { 1996 if ((*vpp)->v_type == VDIR) 1997 vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); 1998 else 
1999 vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR); 2000 } 2001 #endif 2002 } else 2003 *vpp = NULL; 2004 return (err); 2005 } 2006 2007 /* 2008 * Block out VOPs and close zfsvfs_t::z_os 2009 * 2010 * Note, if successful, then we return with the 'z_teardown_lock' and 2011 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 2012 * dataset and objset intact so that they can be atomically handed off during 2013 * a subsequent rollback or recv operation and the resume thereafter. 2014 */ 2015 int 2016 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2017 { 2018 int error; 2019 2020 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2021 return (error); 2022 2023 return (0); 2024 } 2025 2026 /* 2027 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 2028 * is an invariant across any of the operations that can be performed while the 2029 * filesystem was suspended. Whether it succeeded or failed, the preconditions 2030 * are the same: the relevant objset and associated dataset are owned by 2031 * zfsvfs, held, and long held on entry. 2032 */ 2033 int 2034 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2035 { 2036 int err; 2037 znode_t *zp; 2038 2039 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 2040 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 2041 2042 /* 2043 * We already own this, so just update the objset_t, as the one we 2044 * had before may have been evicted. 
2045 */ 2046 objset_t *os; 2047 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2048 VERIFY(dsl_dataset_long_held(ds)); 2049 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 2050 dsl_pool_config_enter(dp, FTAG); 2051 VERIFY0(dmu_objset_from_ds(ds, &os)); 2052 dsl_pool_config_exit(dp, FTAG); 2053 2054 err = zfsvfs_init(zfsvfs, os); 2055 if (err != 0) 2056 goto bail; 2057 2058 ds->ds_dir->dd_activity_cancelled = B_FALSE; 2059 VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); 2060 2061 zfs_set_fuid_feature(zfsvfs); 2062 2063 /* 2064 * Attempt to re-establish all the active znodes with 2065 * their dbufs. If a zfs_rezget() fails, then we'll let 2066 * any potential callers discover that via zfs_enter_verify_zp 2067 * when they try to use their znode. 2068 */ 2069 mutex_enter(&zfsvfs->z_znodes_lock); 2070 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2071 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2072 (void) zfs_rezget(zp); 2073 } 2074 mutex_exit(&zfsvfs->z_znodes_lock); 2075 2076 bail: 2077 /* release the VOPs */ 2078 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 2079 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 2080 2081 if (err) { 2082 /* 2083 * Since we couldn't setup the sa framework, try to force 2084 * unmount this file system. 
2085 */ 2086 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { 2087 vfs_ref(zfsvfs->z_vfs); 2088 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); 2089 } 2090 } 2091 return (err); 2092 } 2093 2094 static void 2095 zfs_freevfs(vfs_t *vfsp) 2096 { 2097 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2098 2099 zfsvfs_free(zfsvfs); 2100 2101 atomic_dec_32(&zfs_active_fs_count); 2102 } 2103 2104 #ifdef __i386__ 2105 static int desiredvnodes_backup; 2106 #include <sys/vmmeter.h> 2107 2108 2109 #include <vm/vm_page.h> 2110 #include <vm/vm_object.h> 2111 #include <vm/vm_kern.h> 2112 #include <vm/vm_map.h> 2113 #endif 2114 2115 static void 2116 zfs_vnodes_adjust(void) 2117 { 2118 #ifdef __i386__ 2119 int newdesiredvnodes; 2120 2121 desiredvnodes_backup = desiredvnodes; 2122 2123 /* 2124 * We calculate newdesiredvnodes the same way it is done in 2125 * vntblinit(). If it is equal to desiredvnodes, it means that 2126 * it wasn't tuned by the administrator and we can tune it down. 2127 */ 2128 newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * 2129 vm_kmem_size / (5 * (sizeof (struct vm_object) + 2130 sizeof (struct vnode)))); 2131 if (newdesiredvnodes == desiredvnodes) 2132 desiredvnodes = (3 * newdesiredvnodes) / 4; 2133 #endif 2134 } 2135 2136 static void 2137 zfs_vnodes_adjust_back(void) 2138 { 2139 2140 #ifdef __i386__ 2141 desiredvnodes = desiredvnodes_backup; 2142 #endif 2143 } 2144 2145 static struct sx zfs_vnlru_lock; 2146 static struct vnode *zfs_vnlru_marker; 2147 static arc_prune_t *zfs_prune; 2148 2149 static void 2150 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused) 2151 { 2152 if (nr_to_scan > INT_MAX) 2153 nr_to_scan = INT_MAX; 2154 sx_xlock(&zfs_vnlru_lock); 2155 vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker); 2156 sx_xunlock(&zfs_vnlru_lock); 2157 } 2158 2159 void 2160 zfs_init(void) 2161 { 2162 2163 printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); 2164 2165 /* 2166 * Initialize .zfs directory structures 2167 */ 2168 
zfsctl_init(); 2169 2170 /* 2171 * Initialize znode cache, vnode ops, etc... 2172 */ 2173 zfs_znode_init(); 2174 2175 /* 2176 * Reduce number of vnodes. Originally number of vnodes is calculated 2177 * with UFS inode in mind. We reduce it here, because it's too big for 2178 * ZFS/i386. 2179 */ 2180 zfs_vnodes_adjust(); 2181 2182 dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); 2183 2184 zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); 2185 2186 zfs_vnlru_marker = vnlru_alloc_marker(); 2187 sx_init(&zfs_vnlru_lock, "zfs vnlru lock"); 2188 zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL); 2189 } 2190 2191 void 2192 zfs_fini(void) 2193 { 2194 arc_remove_prune_callback(zfs_prune); 2195 vnlru_free_marker(zfs_vnlru_marker); 2196 sx_destroy(&zfs_vnlru_lock); 2197 2198 taskq_destroy(zfsvfs_taskq); 2199 zfsctl_fini(); 2200 zfs_znode_fini(); 2201 zfs_vnodes_adjust_back(); 2202 } 2203 2204 int 2205 zfs_busy(void) 2206 { 2207 return (zfs_active_fs_count != 0); 2208 } 2209 2210 /* 2211 * Release VOPs and unmount a suspended filesystem. 2212 */ 2213 int 2214 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2215 { 2216 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); 2217 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); 2218 2219 /* 2220 * We already own this, so just hold and rele it to update the 2221 * objset_t, as the one we had before may have been evicted. 2222 */ 2223 objset_t *os; 2224 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2225 VERIFY(dsl_dataset_long_held(ds)); 2226 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); 2227 dsl_pool_config_enter(dp, FTAG); 2228 VERIFY0(dmu_objset_from_ds(ds, &os)); 2229 dsl_pool_config_exit(dp, FTAG); 2230 zfsvfs->z_os = os; 2231 2232 /* release the VOPs */ 2233 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); 2234 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); 2235 2236 /* 2237 * Try to force unmount this file system. 
	 */
	(void) zfs_umount(zfsvfs->z_vfs, 0);
	zfsvfs->z_unmounted = B_TRUE;
	return (0);
}

/*
 * Upgrade the on-disk ZPL version of this filesystem to newvers.
 *
 * Rejects out-of-range versions and downgrades (EINVAL), and versions the
 * underlying pool is too old to support (ENOTSUP).  When crossing the SA
 * threshold, also creates the SA master node and registers the SA upgrade
 * callback.  Returns 0 on success or an errno-style error.
 */
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (SET_ERROR(EINVAL));

	if (newvers < zfsvfs->z_version)
		return (SET_ERROR(EINVAL));

	if (zfs_spa_version_map(newvers) >
	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
		return (SET_ERROR(ENOTSUP));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		/* Also reserve room for the SA master node created below. */
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    ZFS_SA_ATTRS);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	}
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &newvers, tx);

	if (error) {
		/* tx was assigned, so it must be committed, not aborted. */
		dmu_tx_commit(tx);
		return (error);
	}

	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		uint64_t sa_obj;

		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
		    SPA_VERSION_SA);
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);

		error = zap_add(os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT0(error);

		VERIFY0(sa_set_sa_object(os, sa_obj));
		sa_register_update_callback(os, zfs_sa_upgrade);
	}

	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
	    (uintmax_t)newvers);
	dmu_tx_commit(tx);

	/* Update the cached in-core versions only after the tx commits. */
	zfsvfs->z_version = newvers;
	os->os_version = newvers;

	zfs_set_fuid_feature(zfsvfs);

	return (0);
}

/*
 * Persist a default quota property in the master node ZAP and refresh the
 * cached value on the zfsvfs.  A quota of 0 removes the entry (clears the
 * default); ENOENT from the removal is not an error.
 */
int
zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop,
    uint64_t quota)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	const char *propstr = zfs_prop_to_name(prop);
	dmu_tx_t *tx;

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	if (quota == 0) {
		/* Zero clears the default; a missing entry already means 0. */
		error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx);
		if (error == ENOENT)
			error = 0;
	} else {
		error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1,
		    &quota, tx);
	}

	if (error)
		goto out;

	/* Mirror the persisted value into the in-core zfsvfs cache. */
	switch (prop) {
	case ZFS_PROP_DEFAULTUSERQUOTA:
		zfsvfs->z_defaultuserquota = quota;
		break;
	case ZFS_PROP_DEFAULTGROUPQUOTA:
		zfsvfs->z_defaultgroupquota = quota;
		break;
	case ZFS_PROP_DEFAULTPROJECTQUOTA:
		zfsvfs->z_defaultprojectquota = quota;
		break;
	case ZFS_PROP_DEFAULTUSEROBJQUOTA:
		zfsvfs->z_defaultuserobjquota = quota;
		break;
	case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
		zfsvfs->z_defaultgroupobjquota = quota;
		break;
	case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
		zfsvfs->z_defaultprojectobjquota = quota;
		break;
	default:
		break;
	}

out:
	/* tx is assigned by this point, so commit even on error. */
	dmu_tx_commit(tx);
	return (error);
}

/*
 * Return true if the corresponding vfs's unmounted flag is set.
 * Otherwise return false.
 * If this function returns true we know VFS unmount has been initiated.
 */
boolean_t
zfs_get_vfs_flag_unmounted(objset_t *os)
{
	zfsvfs_t *zfvp;
	boolean_t unmounted = B_FALSE;

	ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);

	/* The user pointer lock keeps zfvp valid while we peek at it. */
	mutex_enter(&os->os_user_ptr_lock);
	zfvp = dmu_objset_get_user(os);
	if (zfvp != NULL && zfvp->z_vfs != NULL &&
	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
		unmounted = B_TRUE;
	mutex_exit(&os->os_user_ptr_lock);

	return (unmounted);
}

#ifdef _KERNEL
/*
 * After a dataset rename, rewrite f_mntfromname on every affected mount:
 * an exact match of oldname is replaced by newname, and any mount whose
 * name continues past oldname with '/' or '@' (a child dataset or a
 * snapshot) has the oldname prefix replaced by newname.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char tmpbuf[MAXPATHLEN];
	struct mount *mp;
	char *fromname;
	size_t oldlen;

	oldlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		fromname = mp->mnt_stat.f_mntfromname;
		if (strcmp(fromname, oldname) == 0) {
			(void) strlcpy(fromname, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
			continue;
		}
		if (strncmp(fromname, oldname, oldlen) == 0 &&
		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
			    newname, fromname + oldlen);
			(void) strlcpy(fromname, tmpbuf,
			    sizeof (mp->mnt_stat.f_mntfromname));
			continue;
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif

/*
 * Find a prison with ZFS info.
 * Return the ZFS info and the (locked) prison.
 */
static struct zfs_jailparam *
zfs_jailparam_find(struct prison *spr, struct prison **prp)
{
	struct prison *pr;
	struct zfs_jailparam *zjp;

	/*
	 * Walk up the prison hierarchy until we hit a prison carrying ZFS
	 * OSD data; prison0 always resolves to the static global defaults.
	 * On return, the matched prison's mutex is held.
	 */
	for (pr = spr; ; pr = pr->pr_parent) {
		mtx_lock(&pr->pr_mtx);
		if (pr == &prison0) {
			zjp = &zfs_jailparam0;
			break;
		}
		zjp = osd_jail_get(pr, zfs_jailparam_slot);
		if (zjp != NULL)
			break;
		mtx_unlock(&pr->pr_mtx);
	}
	*prp = pr;

	return (zjp);
}

/*
 * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
 * ZFS info and lock the prison.
 */
static void
zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
{
	struct prison *ppr;
	struct zfs_jailparam *zjp, *nzjp;
	void **rsv;

	/* If this prison already has ZFS info, return that. */
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr)
		goto done;

	/*
	 * Allocate a new info record.  Then check again, in case something
	 * changed during the allocation.
	 */
	mtx_unlock(&ppr->pr_mtx);
	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
	rsv = osd_reserve(zfs_jailparam_slot);
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr) {
		/* Lost the race: someone else attached info meanwhile. */
		free(nzjp, M_PRISON);
		osd_free_reserved(rsv);
		goto done;
	}
	/* Inherit the initial values from the ancestor. */
	mtx_lock(&pr->pr_mtx);
	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
	(void) memcpy(nzjp, zjp, sizeof (*zjp));
	zjp = nzjp;
	mtx_unlock(&ppr->pr_mtx);
done:
	/* Either hand back the info with pr locked, or drop the lock. */
	if (zjpp != NULL)
		*zjpp = zjp;
	else
		mtx_unlock(&pr->pr_mtx);
}

/*
 * Jail OSD methods for ZFS VFS info.
 */

/*
 * PR_METHOD_CREATE: give a newly created prison its own ZFS parameter
 * record, unless it explicitly asked to inherit ("zfs=inherit").
 */
static int
zfs_jailparam_create(void *obj, void *data)
{
	struct prison *pr = obj;
	struct vfsoptlist *opts = data;
	int jsys;

	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
	    jsys == JAIL_SYS_INHERIT)
		return (0);
	/*
	 * Inherit a prison's initial values from its parent
	 * (different from JAIL_SYS_INHERIT which also inherits changes).
	 */
	zfs_jailparam_alloc(pr, NULL);
	return (0);
}

/*
 * PR_METHOD_GET: report the prison's ZFS parameters to userland.
 * ENOENT from vfs_setopt just means the caller didn't ask for that
 * parameter, so it is not treated as an error.
 */
static int
zfs_jailparam_get(void *obj, void *data)
{
	struct prison *ppr, *pr = obj;
	struct vfsoptlist *opts = data;
	struct zfs_jailparam *zjp;
	int jsys, error;

	/* zfs_jailparam_find() returns with ppr's mutex held. */
	zjp = zfs_jailparam_find(pr, &ppr);
	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error != 0 && error != ENOENT)
		goto done;
	if (jsys == JAIL_SYS_NEW) {
		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	} else {
		/*
		 * If this prison is inheriting its ZFS info, report
		 * empty/zero parameters.
		 */
		static int mount_snapshot = 0;

		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &mount_snapshot, sizeof (mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	error = 0;
done:
	mtx_unlock(&ppr->pr_mtx);
	return (error);
}

/*
 * PR_METHOD_SET: apply "zfs" / "zfs.mount_snapshot" jail parameters.
 */
static int
zfs_jailparam_set(void *obj, void *data)
{
	struct prison *pr = obj;
	struct prison *ppr;
	struct vfsoptlist *opts = data;
	int error, jsys, mount_snapshot;

	/* Set the parameters, which should be correct.
	 */
	/* -1 marks a parameter the caller did not supply at all. */
	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error == ENOENT)
		jsys = -1;
	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
	    sizeof (mount_snapshot));
	if (error == ENOENT)
		mount_snapshot = -1;
	else
		/* Setting any "zfs.*" parameter implies "zfs=new". */
		jsys = JAIL_SYS_NEW;
	switch (jsys) {
	case JAIL_SYS_NEW:
	{
		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
		struct zfs_jailparam *zjp;

		/*
		 * A child jail cannot have more permissions than its parent
		 */
		if (pr->pr_parent != &prison0) {
			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
			mtx_unlock(&ppr->pr_mtx);
			if (zjp->mount_snapshot < mount_snapshot) {
				return (EPERM);
			}
		}
		/* Returns with pr's mutex held via zjpp. */
		zfs_jailparam_alloc(pr, &zjp);
		if (mount_snapshot != -1)
			zjp->mount_snapshot = mount_snapshot;
		mtx_unlock(&pr->pr_mtx);
		break;
	}
	case JAIL_SYS_INHERIT:
		/* "zfs=inherit": inherit the parent's ZFS info. */
		mtx_lock(&pr->pr_mtx);
		osd_jail_del(pr, zfs_jailparam_slot);
		mtx_unlock(&pr->pr_mtx);
		break;
	case -1:
		/*
		 * If the setting being changed is not ZFS related
		 * then do nothing.
		 */
		break;
	}

	return (0);
}

/*
 * PR_METHOD_CHECK: validate parameters before they are applied; other
 * jsys values are rejected here so zfs_jailparam_set() never sees them.
 */
static int
zfs_jailparam_check(void *obj __unused, void *data)
{
	struct vfsoptlist *opts = data;
	int error, jsys, mount_snapshot;

	/* Check that the parameters are correct.
	 */
	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error != ENOENT) {
		if (error != 0)
			return (error);
		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
			return (EINVAL);
	}
	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
	    sizeof (mount_snapshot));
	if (error != ENOENT) {
		if (error != 0)
			return (error);
		/* mount_snapshot is a boolean-valued parameter. */
		if (mount_snapshot != 0 && mount_snapshot != 1)
			return (EINVAL);
	}
	return (0);
}

/* OSD destructor: free a prison's ZFS parameter record. */
static void
zfs_jailparam_destroy(void *data)
{

	free(data, M_PRISON);
}

/*
 * Register the jail OSD slot and methods at module load, and seed every
 * already-existing prison with the default ZFS parameters.
 */
static void
zfs_jailparam_sysinit(void *arg __unused)
{
	struct prison *pr;
	osd_method_t methods[PR_MAXMETHOD] = {
		[PR_METHOD_CREATE] = zfs_jailparam_create,
		[PR_METHOD_GET] = zfs_jailparam_get,
		[PR_METHOD_SET] = zfs_jailparam_set,
		[PR_METHOD_CHECK] = zfs_jailparam_check,
	};

	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
	/* Copy the defaults to any existing prisons. */
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list)
		zfs_jailparam_alloc(pr, NULL);
	sx_sunlock(&allprison_lock);
}

/* Deregister the jail OSD slot; osd frees per-prison data via destructor. */
static void
zfs_jailparam_sysuninit(void *arg __unused)
{

	osd_jail_deregister(zfs_jailparam_slot);
}

SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysuninit, NULL);